diff options
Diffstat (limited to 'rust')
-rw-r--r-- | rust/helpers/helpers.c | 1 | ||||
-rw-r--r-- | rust/helpers/mm.c | 50 | ||||
-rw-r--r-- | rust/kernel/lib.rs | 1 | ||||
-rw-r--r-- | rust/kernel/miscdevice.rs | 45 | ||||
-rw-r--r-- | rust/kernel/mm.rs | 344 | ||||
-rw-r--r-- | rust/kernel/mm/virt.rs | 471 | ||||
-rw-r--r-- | rust/kernel/task.rs | 247 |
7 files changed, 1041 insertions, 118 deletions
diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index cc81c8ba779f..805307018f0e 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -24,6 +24,7 @@ #include "io.c" #include "jump_label.c" #include "kunit.c" +#include "mm.c" #include "mutex.c" #include "page.c" #include "platform.c" diff --git a/rust/helpers/mm.c b/rust/helpers/mm.c new file mode 100644 index 000000000000..81b510c96fd2 --- /dev/null +++ b/rust/helpers/mm.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include <linux/sched/mm.h> + +void rust_helper_mmgrab(struct mm_struct *mm) +{ + mmgrab(mm); +} + +void rust_helper_mmdrop(struct mm_struct *mm) +{ + mmdrop(mm); +} + +void rust_helper_mmget(struct mm_struct *mm) +{ + mmget(mm); +} + +bool rust_helper_mmget_not_zero(struct mm_struct *mm) +{ + return mmget_not_zero(mm); +} + +void rust_helper_mmap_read_lock(struct mm_struct *mm) +{ + mmap_read_lock(mm); +} + +bool rust_helper_mmap_read_trylock(struct mm_struct *mm) +{ + return mmap_read_trylock(mm); +} + +void rust_helper_mmap_read_unlock(struct mm_struct *mm) +{ + mmap_read_unlock(mm); +} + +struct vm_area_struct *rust_helper_vma_lookup(struct mm_struct *mm, + unsigned long addr) +{ + return vma_lookup(mm, addr); +} + +void rust_helper_vma_end_read(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 24440751ca26..911c72a0fc21 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -72,6 +72,7 @@ pub mod jump_label; pub mod kunit; pub mod list; pub mod miscdevice; +pub mod mm; #[cfg(CONFIG_NET)] pub mod net; pub mod of; diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs index fa9ecc42602a..9d9771247c38 100644 --- a/rust/kernel/miscdevice.rs +++ b/rust/kernel/miscdevice.rs @@ -14,6 +14,7 @@ use crate::{ error::{to_result, Error, Result, VTABLE_DEFAULT_ERROR}, ffi::{c_int, c_long, c_uint, c_ulong}, fs::File, + mm::virt::VmaNew, prelude::*, seq_file::SeqFile, str::CStr, @@ -119,6 +120,22 @@ pub trait MiscDevice: Sized { drop(device); } + /// Handle for mmap. + /// + /// This function is invoked when a user space process invokes the `mmap` system call on + /// `file`. The function is a callback that is part of the VMA initializer. The kernel will do + /// initial setup of the VMA before calling this function. The function can then interact with + /// the VMA initialization by calling methods of `vma`. If the function does not return an + /// error, the kernel will complete initialization of the VMA according to the properties of + /// `vma`. + fn mmap( + _device: <Self::Ptr as ForeignOwnable>::Borrowed<'_>, + _file: &File, + _vma: &VmaNew, + ) -> Result { + build_error!(VTABLE_DEFAULT_ERROR) + } + /// Handler for ioctls. /// /// The `cmd` argument is usually manipulated using the utilties in [`kernel::ioctl`]. @@ -226,6 +243,33 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { /// # Safety /// /// `file` must be a valid file that is associated with a `MiscDeviceRegistration<T>`. + /// `vma` must be a vma that is currently being mmap'ed with this file. + unsafe extern "C" fn mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, + ) -> c_int { + // SAFETY: The mmap call of a file can access the private data. + let private = unsafe { (*file).private_data }; + // SAFETY: This is a Rust Miscdevice, so we call `into_foreign` in `open` and + // `from_foreign` in `release`, and `fops_mmap` is guaranteed to be called between those + // two operations. + let device = unsafe { <T::Ptr as ForeignOwnable>::borrow(private) }; + // SAFETY: The caller provides a vma that is undergoing initial VMA setup. + let area = unsafe { VmaNew::from_raw(vma) }; + // SAFETY: + // * The file is valid for the duration of this call. + // * There is no active fdget_pos region on the file on this thread. + let file = unsafe { File::from_raw_file(file) }; + + match T::mmap(device, file, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } + } + + /// # Safety + /// + /// `file` must be a valid file that is associated with a `MiscDeviceRegistration<T>`. unsafe extern "C" fn ioctl(file: *mut bindings::file, cmd: c_uint, arg: c_ulong) -> c_long { // SAFETY: The ioctl call of a file can access the private data. let private = unsafe { (*file).private_data }; @@ -291,6 +335,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { const VTABLE: bindings::file_operations = bindings::file_operations { open: Some(Self::open), release: Some(Self::release), + mmap: if T::HAS_MMAP { Some(Self::mmap) } else { None }, unlocked_ioctl: if T::HAS_IOCTL { Some(Self::ioctl) } else { diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs new file mode 100644 index 000000000000..615907a0f3b4 --- /dev/null +++ b/rust/kernel/mm.rs @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Memory management. +//! +//! This module deals with managing the address space of userspace processes. Each process has an +//! instance of [`Mm`], which keeps track of multiple VMAs (virtual memory areas). Each VMA +//! corresponds to a region of memory that the userspace process can access, and the VMA lets you +//! control what happens when userspace reads or writes to that region of memory. +//! +//! C header: [`include/linux/mm.h`](srctree/include/linux/mm.h) +#![cfg(CONFIG_MMU)] + +use crate::{ + bindings, + types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, +}; +use core::{ops::Deref, ptr::NonNull}; + +pub mod virt; +use virt::VmaRef; + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This represents the address space of a userspace process, so each process has one `Mm` +/// instance. It may hold many VMAs internally. +/// +/// There is a counter called `mm_users` that counts the users of the address space; this includes +/// the userspace process itself, but can also include kernel threads accessing the address space. +/// Once `mm_users` reaches zero, this indicates that the address space can be destroyed. To access +/// the address space, you must prevent `mm_users` from reaching zero while you are accessing it. +/// The [`MmWithUser`] type represents an address space where this is guaranteed, and you can +/// create one using [`mmget_not_zero`]. +/// +/// The `ARef<Mm>` smart pointer holds an `mmgrab` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmgrab`. +/// +/// [`mmget_not_zero`]: Mm::mmget_not_zero +#[repr(transparent)] +pub struct Mm { + mm: Opaque<bindings::mm_struct>, +} + +// SAFETY: It is safe to call `mmdrop` on another thread than where `mmgrab` was called. +unsafe impl Send for Mm {} +// SAFETY: All methods on `Mm` can be called in parallel from several threads. +unsafe impl Sync for Mm {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for Mm { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmgrab(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull<Self>) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmdrop(obj.cast().as_ptr()) }; + } +} + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is like [`Mm`], but with non-zero `mm_users`. It can only be used when `mm_users` can +/// be proven to be non-zero at compile-time, usually because the relevant code holds an `mmget` +/// refcount. It can be used to access the associated address space. +/// +/// The `ARef<MmWithUser>` smart pointer holds an `mmget` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUser { + mm: Mm, +} + +// SAFETY: It is safe to call `mmput` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUser {} +// SAFETY: All methods on `MmWithUser` can be called in parallel from several threads. +unsafe impl Sync for MmWithUser {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUser { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull<Self>) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput(obj.cast().as_ptr()) }; + } +} + +// Make all `Mm` methods available on `MmWithUser`. +impl Deref for MmWithUser { + type Target = Mm; + + #[inline] + fn deref(&self) -> &Mm { + &self.mm + } +} + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is identical to `MmWithUser` except that it uses `mmput_async` when dropping a +/// refcount. This means that the destructor of `ARef<MmWithUserAsync>` is safe to call in atomic +/// context. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUserAsync { + mm: MmWithUser, +} + +// SAFETY: It is safe to call `mmput_async` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUserAsync {} +// SAFETY: All methods on `MmWithUserAsync` can be called in parallel from several threads. +unsafe impl Sync for MmWithUserAsync {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUserAsync { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull<Self>) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput_async(obj.cast().as_ptr()) }; + } +} + +// Make all `MmWithUser` methods available on `MmWithUserAsync`. +impl Deref for MmWithUserAsync { + type Target = MmWithUser; + + #[inline] + fn deref(&self) -> &MmWithUser { + &self.mm + } +} + +// These methods are safe to call even if `mm_users` is zero. +impl Mm { + /// Returns a raw pointer to the inner `mm_struct`. + #[inline] + pub fn as_raw(&self) -> *mut bindings::mm_struct { + self.mm.get() + } + + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that it is not deallocated + /// during the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a Mm { + // SAFETY: Caller promises that the pointer is valid for 'a. Layouts are compatible due to + // repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Calls `mmget_not_zero` and returns a handle if it succeeds. + #[inline] + pub fn mmget_not_zero(&self) -> Option<ARef<MmWithUser>> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmget_not_zero(self.as_raw()) }; + + if success { + // SAFETY: We just created an `mmget` refcount. + Some(unsafe { ARef::from_raw(NonNull::new_unchecked(self.as_raw().cast())) }) + } else { + None + } + } +} + +// These methods require `mm_users` to be non-zero. +impl MmWithUser { + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that `mm_users` remains + /// non-zero for the duration of the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a MmWithUser { + // SAFETY: Caller promises that the pointer is valid for 'a. The layout is compatible due + // to repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Use `mmput_async` when dropping this refcount. + #[inline] + pub fn into_mmput_async(me: ARef<MmWithUser>) -> ARef<MmWithUserAsync> { + // SAFETY: The layouts and invariants are compatible. + unsafe { ARef::from_raw(ARef::into_raw(me).cast()) } + } + + /// Attempt to access a vma using the vma read lock. + /// + /// This is an optimistic trylock operation, so it may fail if there is contention. In that + /// case, you should fall back to taking the mmap read lock. + /// + /// When per-vma locks are disabled, this always returns `None`. + #[inline] + pub fn lock_vma_under_rcu(&self, vma_addr: usize) -> Option<VmaReadGuard<'_>> { + #[cfg(CONFIG_PER_VMA_LOCK)] + { + // SAFETY: Calling `bindings::lock_vma_under_rcu` is always okay given an mm where + // `mm_users` is non-zero. + let vma = unsafe { bindings::lock_vma_under_rcu(self.as_raw(), vma_addr) }; + if !vma.is_null() { + return Some(VmaReadGuard { + // SAFETY: If `lock_vma_under_rcu` returns a non-null ptr, then it points at a + // valid vma. The vma is stable for as long as the vma read lock is held. + vma: unsafe { VmaRef::from_raw(vma) }, + _nts: NotThreadSafe, + }); + } + } + + // Silence warnings about unused variables. + #[cfg(not(CONFIG_PER_VMA_LOCK))] + let _ = vma_addr; + + None + } + + /// Lock the mmap read lock. + #[inline] + pub fn mmap_read_lock(&self) -> MmapReadGuard<'_> { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmap_read_lock(self.as_raw()) }; + + // INVARIANT: We just acquired the read lock. + MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + } + } + + /// Try to lock the mmap read lock. + #[inline] + pub fn mmap_read_trylock(&self) -> Option<MmapReadGuard<'_>> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmap_read_trylock(self.as_raw()) }; + + if success { + // INVARIANT: We just acquired the read lock. + Some(MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + }) + } else { + None + } + } +} + +/// A guard for the mmap read lock. +/// +/// # Invariants +/// +/// This `MmapReadGuard` guard owns the mmap read lock. +pub struct MmapReadGuard<'a> { + mm: &'a MmWithUser, + // `mmap_read_lock` and `mmap_read_unlock` must be called on the same thread + _nts: NotThreadSafe, +} + +impl<'a> MmapReadGuard<'a> { + /// Look up a vma at the given address. + #[inline] + pub fn vma_lookup(&self, vma_addr: usize) -> Option<&virt::VmaRef> { + // SAFETY: By the type invariants we hold the mmap read guard, so we can safely call this + // method. Any value is okay for `vma_addr`. + let vma = unsafe { bindings::vma_lookup(self.mm.as_raw(), vma_addr) }; + + if vma.is_null() { + None + } else { + // SAFETY: We just checked that a vma was found, so the pointer references a valid vma. + // + // Furthermore, the returned vma is still under the protection of the read lock guard + // and can be used while the mmap read lock is still held. That the vma is not used + // after the MmapReadGuard gets dropped is enforced by the borrow-checker. + unsafe { Some(virt::VmaRef::from_raw(vma)) } + } + } +} + +impl Drop for MmapReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::mmap_read_unlock(self.mm.as_raw()) }; + } +} + +/// A guard for the vma read lock. +/// +/// # Invariants +/// +/// This `VmaReadGuard` guard owns the vma read lock. +pub struct VmaReadGuard<'a> { + vma: &'a VmaRef, + // `vma_end_read` must be called on the same thread as where the lock was taken + _nts: NotThreadSafe, +} + +// Make all `VmaRef` methods available on `VmaReadGuard`. +impl Deref for VmaReadGuard<'_> { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + self.vma + } +} + +impl Drop for VmaReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::vma_end_read(self.vma.as_ptr()) }; + } +} diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs new file mode 100644 index 000000000000..31803674aecc --- /dev/null +++ b/rust/kernel/mm/virt.rs @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Virtual memory. +//! +//! This module deals with managing a single VMA in the address space of a userspace process. Each +//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets +//! you control what happens when userspace reads or writes to that region of memory. +//! +//! The module has several different Rust types that all correspond to the C type called +//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g. +//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct +//! ensures that you can't, for example, accidentally call a function that requires holding the +//! write lock when you only hold the read lock. + +use crate::{ + bindings, + error::{code::EINVAL, to_result, Result}, + mm::MmWithUser, + page::Page, + types::Opaque, +}; + +use core::ops::Deref; + +/// A wrapper for the kernel's `struct vm_area_struct` with read access. +/// +/// It represents an area of virtual memory. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. +#[repr(transparent)] +pub struct VmaRef { + vma: Opaque<bindings::vm_area_struct>, +} + +// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable +// no matter what the vma flags are. +impl VmaRef { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma + /// read lock (or stronger) is held for at least the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Returns a raw pointer to this area. + #[inline] + pub fn as_ptr(&self) -> *mut bindings::vm_area_struct { + self.vma.get() + } + + /// Access the underlying `mm_struct`. + #[inline] + pub fn mm(&self) -> &MmWithUser { + // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma + // read lock or stronger. This implies that the underlying mm has a non-zero value of + // `mm_users`. + unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) } + } + + /// Returns the flags associated with the virtual memory area. + /// + /// The possible flags are a combination of the constants in [`flags`]. + #[inline] + pub fn flags(&self) -> vm_flags_t { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags } + } + + /// Returns the (inclusive) start address of the virtual memory area. + #[inline] + pub fn start(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start } + } + + /// Returns the (exclusive) end address of the virtual memory area. + #[inline] + pub fn end(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end } + } + + /// Zap pages in the given page range. + /// + /// This clears page table mappings for the range at the leaf level, leaving all other page + /// tables intact, and freeing any memory referenced by the VMA in this range. That is, + /// anonymous memory is completely freed, file-backed memory has its reference count on page + /// cache folio's dropped, any dirty data will still be written back to disk as usual. + /// + /// It may seem odd that we clear at the leaf level, this is however a product of the page + /// table structure used to map physical memory into a virtual address space - each virtual + /// address actually consists of a bitmap of array indices into page tables, which form a + /// hierarchical page table level structure. + /// + /// As a result, each page table level maps a multiple of page table levels below, and thus + /// span ever increasing ranges of pages. At the leaf or PTE level, we map the actual physical + /// memory. + /// + /// It is here where a zap operates, as it the only place we can be certain of clearing without + /// impacting any other virtual mappings. It is an implementation detail as to whether the + /// kernel goes further in freeing unused page tables, but for the purposes of this operation + /// we must only assume that the leaf level is cleared. + #[inline] + pub fn zap_page_range_single(&self, address: usize, size: usize) { + let (end, did_overflow) = address.overflowing_add(size); + if did_overflow || address < self.start() || self.end() < end { + // TODO: call WARN_ONCE once Rust version of it is added + return; + } + + // SAFETY: By the type invariants, the caller has read access to this VMA, which is + // sufficient for this method call. This method has no requirements on the vma flags. The + // address range is checked to be within the vma. + unsafe { + bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) + }; + } + + /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise + /// returns `None`. + /// + /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set. + /// + /// [`VM_MIXEDMAP`]: flags::MIXEDMAP + #[inline] + pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> { + if self.flags() & flags::MIXEDMAP != 0 { + // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are + // satisfied by the type invariants of `VmaRef`. + Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) }) + } else { + None + } + } +} + +/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set. +/// +/// It represents an area of virtual memory. +/// +/// This struct is identical to [`VmaRef`] except that it must only be used when the +/// [`VM_MIXEDMAP`] flag is set on the vma. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be +/// set. +/// +/// [`VM_MIXEDMAP`]: flags::MIXEDMAP +#[repr(transparent)] +pub struct VmaMixedMap { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaMixedMap`. +impl Deref for VmaMixedMap { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaMixedMap { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock + /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Maps a single page at the given address within the virtual memory area. + /// + /// This operation does not take ownership of the page. + #[inline] + pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result { + // SAFETY: By the type invariant of `Self` caller has read access and has verified that + // `VM_MIXEDMAP` is set. By invariant on `Page` the page has order 0. + to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) }) + } +} + +/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook. +/// +/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to +/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that +/// you are allowed to perform operations on the VMA that can only be performed before the VMA is +/// fully initialized. +/// +/// # Invariants +/// +/// For the duration of 'a, the referenced vma must be undergoing initialization in an +/// `f_ops->mmap()` hook. +pub struct VmaNew { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaNew`. +impl Deref for VmaNew { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaNew { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Internal method for updating the vma flags. + /// + /// # Safety + /// + /// This must not be used to set the flags to an invalid value. + #[inline] + unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) { + let mut flags = self.flags(); + flags |= set; + flags &= !unset; + + // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet + // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel. + // The caller promises that this does not set the flags to an invalid value. + unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags }; + } + + /// Set the `VM_MIXEDMAP` flag on this vma. + /// + /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference + /// that can be used to call `vm_insert_page` on the vma. + #[inline] + pub fn set_mixedmap(&self) -> &VmaMixedMap { + // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an + // invalid state. + unsafe { self.update_flags(flags::MIXEDMAP, 0) }; + + // SAFETY: We just set `VM_MIXEDMAP` on the vma. + unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) } + } + + /// Set the `VM_IO` flag on this vma. + /// + /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to + /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages + /// could have side effects. + #[inline] + pub fn set_io(&self) { + // SAFETY: Setting the VM_IO flag is always okay. + unsafe { self.update_flags(flags::IO, 0) }; + } + + /// Set the `VM_DONTEXPAND` flag on this vma. + /// + /// This prevents the vma from being expanded with `mremap()`. + #[inline] + pub fn set_dontexpand(&self) { + // SAFETY: Setting the VM_DONTEXPAND flag is always okay. + unsafe { self.update_flags(flags::DONTEXPAND, 0) }; + } + + /// Set the `VM_DONTCOPY` flag on this vma. + /// + /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO` + /// is set. + #[inline] + pub fn set_dontcopy(&self) { + // SAFETY: Setting the VM_DONTCOPY flag is always okay. + unsafe { self.update_flags(flags::DONTCOPY, 0) }; + } + + /// Set the `VM_DONTDUMP` flag on this vma. + /// + /// This prevents the vma from being included in core dumps. This option is only permanent if + /// `VM_IO` is set. + #[inline] + pub fn set_dontdump(&self) { + // SAFETY: Setting the VM_DONTDUMP flag is always okay. + unsafe { self.update_flags(flags::DONTDUMP, 0) }; + } + + /// Returns whether `VM_READ` is set. + /// + /// This flag indicates whether userspace is mapping this vma as readable. + #[inline] + pub fn readable(&self) -> bool { + (self.flags() & flags::READ) != 0 + } + + /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma readable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayread(&self) -> Result { + if self.readable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set. + unsafe { self.update_flags(0, flags::MAYREAD) }; + Ok(()) + } + + /// Returns whether `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is mapping this vma as writable. + #[inline] + pub fn writable(&self) -> bool { + (self.flags() & flags::WRITE) != 0 + } + + /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma writable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_maywrite(&self) -> Result { + if self.writable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set. + unsafe { self.update_flags(0, flags::MAYWRITE) }; + Ok(()) + } + + /// Returns whether `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is mapping this vma as executable. + #[inline] + pub fn executable(&self) -> bool { + (self.flags() & flags::EXEC) != 0 + } + + /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma executable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayexec(&self) -> Result { + if self.executable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set. + unsafe { self.update_flags(0, flags::MAYEXEC) }; + Ok(()) + } +} + +/// The integer type used for vma flags. +#[doc(inline)] +pub use bindings::vm_flags_t; + +/// All possible flags for [`VmaRef`]. +pub mod flags { + use super::vm_flags_t; + use crate::bindings; + + /// No flags are set. + pub const NONE: vm_flags_t = bindings::VM_NONE as _; + + /// Mapping allows reads. + pub const READ: vm_flags_t = bindings::VM_READ as _; + + /// Mapping allows writes. + pub const WRITE: vm_flags_t = bindings::VM_WRITE as _; + + /// Mapping allows execution. + pub const EXEC: vm_flags_t = bindings::VM_EXEC as _; + + /// Mapping is shared. + pub const SHARED: vm_flags_t = bindings::VM_SHARED as _; + + /// Mapping may be updated to allow reads. + pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as _; + + /// Mapping may be updated to allow writes. + pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as _; + + /// Mapping may be updated to allow execution. + pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as _; + + /// Mapping may be updated to be shared. + pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as _; + + /// Page-ranges managed without `struct page`, just pure PFN. + pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as _; + + /// Memory mapped I/O or similar. + pub const IO: vm_flags_t = bindings::VM_IO as _; + + /// Do not copy this vma on fork. + pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as _; + + /// Cannot expand with mremap(). + pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as _; + + /// Lock the pages covered when they are faulted in. + pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as _; + + /// Is a VM accounted object. + pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as _; + + /// Should the VM suppress accounting. + pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as _; + + /// Huge TLB Page VM. + pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as _; + + /// Synchronous page faults. (DAX-specific) + pub const SYNC: vm_flags_t = bindings::VM_SYNC as _; + + /// Architecture-specific flag. + pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as _; + + /// Wipe VMA contents in child on fork. + pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as _; + + /// Do not include in the core dump. + pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as _; + + /// Not soft dirty clean area. + pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as _; + + /// Can contain `struct page` and pure PFN pages. + pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as _; + + /// MADV_HUGEPAGE marked this vma. + pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as _; + + /// MADV_NOHUGEPAGE marked this vma. + pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as _; + + /// KSM may merge identical pages. + pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as _; +} diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 9e6f6854948d..927413d85484 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -7,6 +7,7 @@ use crate::{ bindings, ffi::{c_int, c_long, c_uint}, + mm::MmWithUser, pid_namespace::PidNamespace, types::{ARef, NotThreadSafe, Opaque}, }; @@ -33,22 +34,20 @@ pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; #[macro_export] macro_rules! current { () => { - // SAFETY: Deref + addr-of below create a temporary `TaskRef` that cannot outlive the - // caller. + // SAFETY: This expression creates a temporary value that is dropped at the end of the + // caller's scope. The following mechanisms ensure that the resulting `&CurrentTask` cannot + // leave current task context: + // + // * To return to userspace, the caller must leave the current scope. + // * Operations such as `begin_new_exec()` are necessarily unsafe and the caller of + // `begin_new_exec()` is responsible for safety. + // * Rust abstractions for things such as a `kthread_use_mm()` scope must require the + // closure to be `Send`, so the `NotThreadSafe` field of `CurrentTask` ensures that the + // `&CurrentTask` cannot cross the scope in either direction. unsafe { &*$crate::task::Task::current() } }; } -/// Returns the currently running task's pid namespace. -#[macro_export] -macro_rules! current_pid_ns { - () => { - // SAFETY: Deref + addr-of below create a temporary `PidNamespaceRef` that cannot outlive - // the caller. - unsafe { &*$crate::task::Task::current_pid_ns() } - }; -} - /// Wraps the kernel's `struct task_struct`. /// /// # Invariants @@ -87,7 +86,7 @@ macro_rules! current_pid_ns { /// impl State { /// fn new() -> Self { /// Self { -/// creator: current!().into(), +/// creator: ARef::from(&**current!()), /// index: 0, /// } /// } @@ -107,6 +106,44 @@ unsafe impl Send for Task {} // synchronised by C code (e.g., `signal_pending`). unsafe impl Sync for Task {} +/// Represents the [`Task`] in the `current` global. +/// +/// This type exists to provide more efficient operations that are only valid on the current task. +/// For example, to retrieve the pid-namespace of a task, you must use rcu protection unless it is +/// the current task. +/// +/// # Invariants +/// +/// Each value of this type must only be accessed from the task context it was created within. +/// +/// Of course, every thread is in a different task context, but for the purposes of this invariant, +/// these operations also permanently leave the task context: +/// +/// * Returning to userspace from system call context. +/// * Calling `release_task()`. +/// * Calling `begin_new_exec()` in a binary format loader. +/// +/// Other operations temporarily create a new sub-context: +/// +/// * Calling `kthread_use_mm()` creates a new context, and `kthread_unuse_mm()` returns to the +/// old context. +/// +/// This means that a `CurrentTask` obtained before a `kthread_use_mm()` call may be used again +/// once `kthread_unuse_mm()` is called, but it must not be used between these two calls. +/// Conversely, a `CurrentTask` obtained between a `kthread_use_mm()`/`kthread_unuse_mm()` pair +/// must not be used after `kthread_unuse_mm()`. +#[repr(transparent)] +pub struct CurrentTask(Task, NotThreadSafe); + +// Make all `Task` methods available on `CurrentTask`. +impl Deref for CurrentTask { + type Target = Task; + #[inline] + fn deref(&self) -> &Task { + &self.0 + } +} + /// The type of process identifiers (PIDs). pub type Pid = bindings::pid_t; @@ -133,119 +170,29 @@ impl Task { /// /// # Safety /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current() -> impl Deref<Target = Task> { - struct TaskRef<'a> { - task: &'a Task, - _not_send: NotThreadSafe, + /// Callers must ensure that the returned object is only used to access a [`CurrentTask`] + /// within the task context that was active when this function was called. For more details, + /// see the invariants section for [`CurrentTask`]. + pub unsafe fn current() -> impl Deref<Target = CurrentTask> { + struct TaskRef { + task: *const CurrentTask, } - impl Deref for TaskRef<'_> { - type Target = Task; + impl Deref for TaskRef { + type Target = CurrentTask; fn deref(&self) -> &Self::Target { - self.task + // SAFETY: The returned reference borrows from this `TaskRef`, so it cannot outlive + // the `TaskRef`, which the caller of `Task::current()` has promised will not + // outlive the task/thread for which `self.task` is the `current` pointer. Thus, it + // is okay to return a `CurrentTask` reference here. + unsafe { &*self.task } } } - let current = Task::current_raw(); TaskRef { - // SAFETY: If the current thread is still running, the current task is valid. Given - // that `TaskRef` is not `Send`, we know it cannot be transferred to another thread - // (where it could potentially outlive the caller). - task: unsafe { &*current.cast() }, - _not_send: NotThreadSafe, - } - } - - /// Returns a PidNamespace reference for the currently executing task's/thread's pid namespace. - /// - /// This function can be used to create an unbounded lifetime by e.g., storing the returned - /// PidNamespace in a global variable which would be a bug. So the recommended way to get the - /// current task's/thread's pid namespace is to use the [`current_pid_ns`] macro because it is - /// safe. - /// - /// # Safety - /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current_pid_ns() -> impl Deref<Target = PidNamespace> { - struct PidNamespaceRef<'a> { - task: &'a PidNamespace, - _not_send: NotThreadSafe, - } - - impl Deref for PidNamespaceRef<'_> { - type Target = PidNamespace; - - fn deref(&self) -> &Self::Target { - self.task - } - } - - // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. - // - // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. A - // `unshare(CLONE_NEWPID)` or `setns(fd_pidns/pidfd, CLONE_NEWPID)` will not have an effect - // on the calling `Task`'s pid namespace. It will only effect the pid namespace of children - // created by the calling `Task`. This invariant guarantees that after having acquired a - // reference to a `Task`'s pid namespace it will remain unchanged. - // - // When a task has exited and been reaped `release_task()` will be called. This will set - // the `PidNamespace` of the task to `NULL`. So retrieving the `PidNamespace` of a task - // that is dead will return `NULL`. Note, that neither holding the RCU lock nor holding a - // referencing count to - // the `Task` will prevent `release_task()` being called. - // - // In order to retrieve the `PidNamespace` of a `Task` the `task_active_pid_ns()` function - // can be used. There are two cases to consider: - // - // (1) retrieving the `PidNamespace` of the `current` task - // (2) retrieving the `PidNamespace` of a non-`current` task - // - // From system call context retrieving the `PidNamespace` for case (1) is always safe and - // requires neither RCU locking nor a reference count to be held. Retrieving the - // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath - // like that is exposed to Rust. - // - // Retrieving the `PidNamespace` from system call context for (2) requires RCU protection. - // Accessing `PidNamespace` outside of RCU protection requires a reference count that - // must've been acquired while holding the RCU lock. Note that accessing a non-`current` - // task means `NULL` can be returned as the non-`current` task could have already passed - // through `release_task()`. - // - // To retrieve (1) the `current_pid_ns!()` macro should be used which ensure that the - // returned `PidNamespace` cannot outlive the calling scope. The associated - // `current_pid_ns()` function should not be called directly as it could be abused to - // created an unbounded lifetime for `PidNamespace`. The `current_pid_ns!()` macro allows - // Rust to handle the common case of accessing `current`'s `PidNamespace` without RCU - // protection and without having to acquire a reference count. - // - // For (2) the `task_get_pid_ns()` method must be used. This will always acquire a - // reference on `PidNamespace` and will return an `Option` to force the caller to - // explicitly handle the case where `PidNamespace` is `None`, something that tends to be - // forgotten when doing the equivalent operation in `C`. Missing RCU primitives make it - // difficult to perform operations that are otherwise safe without holding a reference - // count as long as RCU protection is guaranteed. But it is not important currently. But we - // do want it in the future. - // - // Note for (2) the required RCU protection around calling `task_active_pid_ns()` - // synchronizes against putting the last reference of the associated `struct pid` of - // `task->thread_pid`. The `struct pid` stored in that field is used to retrieve the - // `PidNamespace` of the caller. When `release_task()` is called `task->thread_pid` will be - // `NULL`ed and `put_pid()` on said `struct pid` will be delayed in `free_pid()` via - // `call_rcu()` allowing everyone with an RCU protected access to the `struct pid` acquired - // from `task->thread_pid` to finish. - // - // SAFETY: The current task's pid namespace is valid as long as the current task is running. - let pidns = unsafe { bindings::task_active_pid_ns(Task::current_raw()) }; - PidNamespaceRef { - // SAFETY: If the current thread is still running, the current task and its associated - // pid namespace are valid. `PidNamespaceRef` is not `Send`, so we know it cannot be - // transferred to another thread (where it could potentially outlive the current - // `Task`). The caller needs to ensure that the PidNamespaceRef doesn't outlive the - // current task/thread. - task: unsafe { PidNamespace::from_ptr(pidns) }, - _not_send: NotThreadSafe, + // CAST: The layout of `struct task_struct` and `CurrentTask` is identical. + task: Task::current_raw().cast(), } } @@ -328,6 +275,70 @@ impl Task { } } +impl CurrentTask { + /// Access the address space of the current task. + /// + /// This function does not touch the refcount of the mm. + #[inline] + pub fn mm(&self) -> Option<&MmWithUser> { + // SAFETY: The `mm` field of `current` is not modified from other threads, so reading it is + // not a data race. + let mm = unsafe { (*self.as_ptr()).mm }; + + if mm.is_null() { + return None; + } + + // SAFETY: If `current->mm` is non-null, then it references a valid mm with a non-zero + // value of `mm_users`. Furthermore, the returned `&MmWithUser` borrows from this + // `CurrentTask`, so it cannot escape the scope in which the current pointer was obtained. + // + // This is safe even if `kthread_use_mm()`/`kthread_unuse_mm()` are used. There are two + // relevant cases: + // * If the `&CurrentTask` was created before `kthread_use_mm()`, then it cannot be + // accessed during the `kthread_use_mm()`/`kthread_unuse_mm()` scope due to the + // `NotThreadSafe` field of `CurrentTask`. + // * If the `&CurrentTask` was created within a `kthread_use_mm()`/`kthread_unuse_mm()` + // scope, then the `&CurrentTask` cannot escape that scope, so the returned `&MmWithUser` + // also cannot escape that scope. + // In either case, it's not possible to read `current->mm` and keep using it after the + // scope is ended with `kthread_unuse_mm()`. + Some(unsafe { MmWithUser::from_raw(mm) }) + } + + /// Access the pid namespace of the current task. + /// + /// This function does not touch the refcount of the namespace or use RCU protection. + /// + /// To access the pid namespace of another task, see [`Task::get_pid_ns`]. + #[doc(alias = "task_active_pid_ns")] + #[inline] + pub fn active_pid_ns(&self) -> Option<&PidNamespace> { + // SAFETY: It is safe to call `task_active_pid_ns` without RCU protection when calling it + // on the current task. + let active_ns = unsafe { bindings::task_active_pid_ns(self.as_ptr()) }; + + if active_ns.is_null() { + return None; + } + + // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. + // + // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. + // + // From system call context retrieving the `PidNamespace` for the current task is always + // safe and requires neither RCU locking nor a reference count to be held. Retrieving the + // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath + // like that is exposed to Rust. + // + // SAFETY: If `current`'s pid ns is non-null, then it references a valid pid ns. + // Furthermore, the returned `&PidNamespace` borrows from this `CurrentTask`, so it cannot + // escape the scope in which the current pointer was obtained, e.g. it cannot live past a + // `release_task()` call. + Some(unsafe { PidNamespace::from_ptr(active_ns) }) + } +} + // SAFETY: The type invariants guarantee that `Task` is always refcounted. unsafe impl crate::types::AlwaysRefCounted for Task { fn inc_ref(&self) { |