path: root/rust/kernel/mm/virt.rs
// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

//! Virtual memory.
//!
//! This module deals with managing a single VMA in the address space of a userspace process. Each
//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets
//! you control what happens when userspace reads or writes to that region of memory.
//!
//! The module has several different Rust types that all correspond to the C type called
//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g.
//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct
//! ensures that you can't, for example, accidentally call a function that requires holding the
//! write lock when you only hold the read lock.
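//!
//! # Examples
//!
//! A minimal sketch of how a driver's `f_ops->mmap()` hook might configure a new VMA. The
//! surrounding file plumbing is assumed, and the hook name is hypothetical:
//!
//! ```ignore
//! use kernel::mm::virt::VmaNew;
//!
//! fn my_mmap(vma: &VmaNew) -> kernel::error::Result {
//!     // Treat the range as memory-mapped I/O and pin the size of the mapping.
//!     vma.set_io();
//!     vma.set_dontexpand();
//!     // Refuse mappings that are writable, or that could later be made writable.
//!     vma.try_clear_maywrite()?;
//!     Ok(())
//! }
//! ```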

use crate::{
    bindings,
    error::{code::EINVAL, to_result, Result},
    mm::MmWithUser,
    page::Page,
    types::Opaque,
};

use core::ops::Deref;

/// A wrapper for the kernel's `struct vm_area_struct` with read access.
///
/// It represents an area of virtual memory.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock.
#[repr(transparent)]
pub struct VmaRef {
    vma: Opaque<bindings::vm_area_struct>,
}

// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable
// no matter what the vma flags are.
impl VmaRef {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma
    /// read lock (or stronger) is held for at least the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Returns a raw pointer to this area.
    #[inline]
    pub fn as_ptr(&self) -> *mut bindings::vm_area_struct {
        self.vma.get()
    }

    /// Access the underlying `mm_struct`.
    #[inline]
    pub fn mm(&self) -> &MmWithUser {
        // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma
        // read lock or stronger. This implies that the underlying mm has a non-zero value of
        // `mm_users`.
        unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) }
    }

    /// Returns the flags associated with the virtual memory area.
    ///
    /// The possible flags are a combination of the constants in [`flags`].
    #[inline]
    pub fn flags(&self) -> vm_flags_t {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags }
    }

    /// Returns the (inclusive) start address of the virtual memory area.
    #[inline]
    pub fn start(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start }
    }

    /// Returns the (exclusive) end address of the virtual memory area.
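    ///
    /// # Examples
    ///
    /// A sketch of computing a vma's length; `vma` is assumed to come from elsewhere:
    ///
    /// ```ignore
    /// fn vma_len(vma: &kernel::mm::virt::VmaRef) -> usize {
    ///     vma.end() - vma.start()
    /// }
    /// ```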
    #[inline]
    pub fn end(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end }
    }

    /// Zap pages in the given page range.
    ///
    /// This clears page table mappings for the range at the leaf level, leaving all other page
    /// tables intact, and freeing any memory referenced by the VMA in this range. That is,
    /// anonymous memory is completely freed, file-backed memory has its reference count on the
    /// page cache folios dropped, and any dirty data will still be written back to disk as usual.
    ///
    /// It may seem odd that we clear at the leaf level; this is, however, a product of the page
    /// table structure used to map physical memory into a virtual address space: each virtual
    /// address is actually a series of array indices into page tables, which together form a
    /// hierarchical page table structure.
    ///
    /// As a result, each page table level covers a multiple of the range covered by the level
    /// below it, and thus spans ever-increasing ranges of pages. At the leaf, or PTE, level we
    /// map the actual physical memory.
    ///
    /// This is the level at which a zap operates, as it is the only place where we can be certain
    /// of clearing without impacting any other virtual mappings. It is an implementation detail
    /// whether the kernel goes further in freeing unused page tables, but for the purposes of
    /// this operation we must only assume that the leaf level is cleared.
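    ///
    /// # Examples
    ///
    /// A sketch of zapping the first page of a vma; `vma` is assumed to be a valid reference, and
    /// the range is checked by the method itself:
    ///
    /// ```ignore
    /// fn zap_first_page(vma: &kernel::mm::virt::VmaRef) {
    ///     vma.zap_page_range_single(vma.start(), kernel::page::PAGE_SIZE);
    /// }
    /// ```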
    #[inline]
    pub fn zap_page_range_single(&self, address: usize, size: usize) {
        let (end, did_overflow) = address.overflowing_add(size);
        if did_overflow || address < self.start() || self.end() < end {
            // TODO: call WARN_ONCE once Rust version of it is added
            return;
        }

        // SAFETY: By the type invariants, the caller has read access to this VMA, which is
        // sufficient for this method call. This method has no requirements on the vma flags. The
        // address range is checked to be within the vma.
        unsafe {
            bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut())
        };
    }

    /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise
    /// returns `None`.
    ///
    /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set.
    ///
    /// [`VM_MIXEDMAP`]: flags::MIXEDMAP
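    ///
    /// # Examples
    ///
    /// A sketch of the intended pattern; `vma`, `addr`, and `page` are assumed inputs:
    ///
    /// ```ignore
    /// fn try_insert(vma: &kernel::mm::virt::VmaRef, addr: usize, page: &kernel::page::Page) {
    ///     if let Some(mixed) = vma.as_mixedmap_vma() {
    ///         let _ = mixed.vm_insert_page(addr, page);
    ///     }
    /// }
    /// ```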
    #[inline]
    pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> {
        if self.flags() & flags::MIXEDMAP != 0 {
            // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are
            // satisfied by the type invariants of `VmaRef`.
            Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) })
        } else {
            None
        }
    }
}

/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set.
///
/// It represents an area of virtual memory.
///
/// This struct is identical to [`VmaRef`] except that it must only be used when the
/// [`VM_MIXEDMAP`] flag is set on the vma.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be
/// set.
///
/// [`VM_MIXEDMAP`]: flags::MIXEDMAP
#[repr(transparent)]
pub struct VmaMixedMap {
    vma: VmaRef,
}

// Make all `VmaRef` methods available on `VmaMixedMap`.
impl Deref for VmaMixedMap {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}

impl VmaMixedMap {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock
    /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Maps a single page at the given address within the virtual memory area.
    ///
    /// This operation does not take ownership of the page.
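    ///
    /// # Examples
    ///
    /// A sketch of inserting a freshly allocated page; since the vma does not take ownership, the
    /// caller keeps the page alive (the allocation flags here are illustrative):
    ///
    /// ```ignore
    /// use kernel::page::Page;
    ///
    /// fn insert_new_page(
    ///     vma: &kernel::mm::virt::VmaMixedMap,
    ///     addr: usize,
    /// ) -> kernel::error::Result<Page> {
    ///     let page = Page::alloc_page(kernel::alloc::flags::GFP_KERNEL)?;
    ///     vma.vm_insert_page(addr, &page)?;
    ///     Ok(page)
    /// }
    /// ```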
    #[inline]
    pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result {
        // SAFETY: By the type invariant of `Self`, the caller has read access and has verified
        // that `VM_MIXEDMAP` is set. By the invariant on `Page`, the page has order 0.
        to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) })
    }
}

/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook.
///
/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to
/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that
/// you are allowed to perform operations on the VMA that can only be performed before the VMA is
/// fully initialized.
///
/// # Invariants
///
/// For the lifetime of any reference to it, the referenced vma must be undergoing initialization
/// in an `f_ops->mmap()` hook.
pub struct VmaNew {
    vma: VmaRef,
}

// Make all `VmaRef` methods available on `VmaNew`.
impl Deref for VmaNew {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}

impl VmaNew {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Internal method for updating the vma flags.
    ///
    /// # Safety
    ///
    /// This must not be used to set the flags to an invalid value.
    #[inline]
    unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) {
        let mut flags = self.flags();
        flags |= set;
        flags &= !unset;

        // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet
        // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel.
        // The caller promises that this does not set the flags to an invalid value.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags };
    }

    /// Set the `VM_MIXEDMAP` flag on this vma.
    ///
    /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference
    /// that can be used to call `vm_insert_page` on the vma.
    #[inline]
    pub fn set_mixedmap(&self) -> &VmaMixedMap {
        // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an
        // invalid state.
        unsafe { self.update_flags(flags::MIXEDMAP, 0) };

        // SAFETY: We just set `VM_MIXEDMAP` on the vma.
        unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) }
    }

    /// Set the `VM_IO` flag on this vma.
    ///
    /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to
    /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages
    /// could have side effects.
    #[inline]
    pub fn set_io(&self) {
        // SAFETY: Setting the VM_IO flag is always okay.
        unsafe { self.update_flags(flags::IO, 0) };
    }

    /// Set the `VM_DONTEXPAND` flag on this vma.
    ///
    /// This prevents the vma from being expanded with `mremap()`.
    #[inline]
    pub fn set_dontexpand(&self) {
        // SAFETY: Setting the VM_DONTEXPAND flag is always okay.
        unsafe { self.update_flags(flags::DONTEXPAND, 0) };
    }

    /// Set the `VM_DONTCOPY` flag on this vma.
    ///
    /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO`
    /// is set.
    #[inline]
    pub fn set_dontcopy(&self) {
        // SAFETY: Setting the VM_DONTCOPY flag is always okay.
        unsafe { self.update_flags(flags::DONTCOPY, 0) };
    }

    /// Set the `VM_DONTDUMP` flag on this vma.
    ///
    /// This prevents the vma from being included in core dumps. This option is only permanent if
    /// `VM_IO` is set.
    #[inline]
    pub fn set_dontdump(&self) {
        // SAFETY: Setting the VM_DONTDUMP flag is always okay.
        unsafe { self.update_flags(flags::DONTDUMP, 0) };
    }

    /// Returns whether `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as readable.
    #[inline]
    pub fn readable(&self) -> bool {
        (self.flags() & flags::READ) != 0
    }

    /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma readable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayread(&self) -> Result {
        if self.readable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set.
        unsafe { self.update_flags(0, flags::MAYREAD) };
        Ok(())
    }

    /// Returns whether `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as writable.
    #[inline]
    pub fn writable(&self) -> bool {
        (self.flags() & flags::WRITE) != 0
    }

    /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma writable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_maywrite(&self) -> Result {
        if self.writable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set.
        unsafe { self.update_flags(0, flags::MAYWRITE) };
        Ok(())
    }

    /// Returns whether `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as executable.
    #[inline]
    pub fn executable(&self) -> bool {
        (self.flags() & flags::EXEC) != 0
    }

    /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma executable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayexec(&self) -> Result {
        if self.executable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set.
        unsafe { self.update_flags(0, flags::MAYEXEC) };
        Ok(())
    }
}

/// The integer type used for vma flags.
#[doc(inline)]
pub use bindings::vm_flags_t;

/// All possible flags for [`VmaRef`].
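///
/// # Examples
///
/// A sketch of testing a combination of flags on a hypothetical `vma` reference:
///
/// ```ignore
/// use kernel::mm::virt::{flags, VmaRef};
///
/// fn is_shared_writable(vma: &VmaRef) -> bool {
///     let wanted = flags::SHARED | flags::WRITE;
///     vma.flags() & wanted == wanted
/// }
/// ```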
pub mod flags {
    use super::vm_flags_t;
    use crate::bindings;

    /// No flags are set.
    pub const NONE: vm_flags_t = bindings::VM_NONE as _;

    /// Mapping allows reads.
    pub const READ: vm_flags_t = bindings::VM_READ as _;

    /// Mapping allows writes.
    pub const WRITE: vm_flags_t = bindings::VM_WRITE as _;

    /// Mapping allows execution.
    pub const EXEC: vm_flags_t = bindings::VM_EXEC as _;

    /// Mapping is shared.
    pub const SHARED: vm_flags_t = bindings::VM_SHARED as _;

    /// Mapping may be updated to allow reads.
    pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as _;

    /// Mapping may be updated to allow writes.
    pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as _;

    /// Mapping may be updated to allow execution.
    pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as _;

    /// Mapping may be updated to be shared.
    pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as _;

    /// Page-ranges managed without `struct page`, just pure PFN.
    pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as _;

    /// Memory mapped I/O or similar.
    pub const IO: vm_flags_t = bindings::VM_IO as _;

    /// Do not copy this vma on fork.
    pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as _;

    /// Cannot expand with `mremap()`.
    pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as _;

    /// Lock the pages covered when they are faulted in.
    pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as _;

    /// Is a VM accounted object.
    pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as _;

    /// Should the VM suppress accounting.
    pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as _;

    /// Huge TLB Page VM.
    pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as _;

    /// Synchronous page faults. (DAX-specific)
    pub const SYNC: vm_flags_t = bindings::VM_SYNC as _;

    /// Architecture-specific flag.
    pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as _;

    /// Wipe VMA contents in child on fork.
    pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as _;

    /// Do not include in the core dump.
    pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as _;

    /// Not soft dirty clean area.
    pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as _;

    /// Can contain `struct page` and pure PFN pages.
    pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as _;

    /// `MADV_HUGEPAGE` marked this vma.
    pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as _;

    /// `MADV_NOHUGEPAGE` marked this vma.
    pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as _;

    /// KSM may merge identical pages.
    pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as _;
}