Diffstat (limited to 'kernel')
444 files changed, 46016 insertions, 21890 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index c6b299a6b786..a501bfc80694 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only /config_data /kheaders.md5 +/kheaders-objlist +/kheaders-srclist diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f871951..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -38,8 +38,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is @@ -116,6 +115,35 @@ config CRASH_DUMP For s390, this option also enables zfcpdump. See also <file:Documentation/arch/s390/zfcpdump.rst> +config CRASH_DM_CRYPT + bool "Support saving crash dump to dm-crypt encrypted volume" + depends on KEXEC_FILE + depends on CRASH_DUMP + depends on DM_CRYPT + depends on KEYS + help + With this option enabled, user space can intereact with + /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys + persistent for the dump-capture kernel. + +config CRASH_DM_CRYPT_CONFIGS + def_tristate CRASH_DM_CRYPT + select CONFIGFS_FS + help + CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that + is required to be built-in. + +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 54ea59ff8fbe..da326800c1c9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -103,6 +103,19 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. +config PREEMPT_RT_NEEDS_BH_LOCK + bool "Enforce softirq synchronisation on PREEMPT_RT" + depends on PREEMPT_RT + help + Enforce synchronisation across the softirqs context. On PREEMPT_RT + the softirq is preemptible. This enforces the same per-CPU BLK + semantic non-PREEMPT_RT builds have. This should not be needed + because per-CPU locks were added to avoid the per-CPU BKL. + + This switch provides the old behaviour for testing reasons. Select + this if you suspect an error with preemptible softirq and want test + the old synchronized behaviour. 
+ config PREEMPT_COUNT bool diff --git a/kernel/Makefile b/kernel/Makefile index 434929de17ef..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,11 +8,10 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o -obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o @@ -53,8 +52,10 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ +obj-y += unwind/ obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o @@ -77,6 +78,8 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o +obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o @@ -120,7 +123,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ @@ -137,11 +140,12 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o -CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o -KASAN_SANITIZE_stackleak.o := n -KCSAN_SANITIZE_stackleak.o := n -KCOV_INSTRUMENT_stackleak.o := n +CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE) +CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only) +obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o +KASAN_SANITIZE_kstack_erase.o := n +KCSAN_SANITIZE_kstack_erase.o := n +KCOV_INSTRUMENT_kstack_erase.o := n obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o @@ -156,11 +160,48 @@ filechk_cat = cat $< $(obj)/config_data: $(KCONFIG_CONFIG) FORCE $(call filechk,cat) +# kheaders_data.tar.xz $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz -quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ -$(obj)/kheaders_data.tar.xz: FORCE - $(call cmd,genikh) +quiet_cmd_kheaders_data = GEN $@ + cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)" + cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile) -clean-files := kheaders_data.tar.xz kheaders.md5 +define rule_kheaders_data + $(call cmd_and_savecmd,kheaders_data) + $(call cmd,kheaders_data_dep) +endef + +targets += kheaders_data.tar.xz +$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE + $(call if_changed_rule,kheaders_data) + +# generated headers in objtree +# +# include/generated/utsversion.h is ignored because it is generated +# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders) +filechk_kheaders_objlist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$${d}/generated" ! 
-path "include/generated/utsversion.h" -a -name "*.h" -print; \ + done + +$(obj)/kheaders-objlist: FORCE + $(call filechk,kheaders_objlist) + +# non-generated headers in srctree +filechk_kheaders_srclist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \ + done + +$(obj)/kheaders-srclist: FORCE + $(call filechk,kheaders_srclist) + +# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should +# be rebuilt. +filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum + +$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE + $(call filechk,kheaders_md5sum) + +clean-files := kheaders.md5 kheaders-srclist kheaders-objlist diff --git a/kernel/acct.c b/kernel/acct.c index 6520baa13669..2a2b3c874acd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -44,19 +44,14 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/capability.h> -#include <linux/file.h> #include <linux/tty.h> -#include <linux/security.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/jiffies.h> -#include <linux/times.h> #include <linux/syscalls.h> -#include <linux/mount.h> -#include <linux/uaccess.h> +#include <linux/namei.h> #include <linux/sched/cputime.h> #include <asm/div64.h> @@ -217,84 +212,70 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static int acct_on(struct filename *pathname) +DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T)) +static int acct_on(const char __user *name) { - struct file *file; - struct vfsmount *mnt, *internal; + /* Difference from BSD - they don't do O_APPEND */ + const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE; struct pid_namespace *ns = task_active_pid_ns(current); + struct filename *pathname __free(putname) = getname(name); + struct file *original_file __free(fput) = NULL; // in that order + struct path internal __free(path_put) = {}; // in that order + struct file *file __free(fput_sync) = NULL; // in that order struct bsd_acct_struct *acct; + struct vfsmount *mnt; struct fs_pin *old; - int err; - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (!acct) - return -ENOMEM; + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + original_file = file_open_name(pathname, open_flags, 0); + if (IS_ERR(original_file)) + return PTR_ERR(original_file); - /* Difference from BSD - they don't do O_APPEND */ - file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) { - kfree(acct); + mnt = mnt_clone_internal(&original_file->f_path); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + internal.mnt = mnt; + internal.dentry = dget(mnt->mnt_root); + + file = dentry_open(&internal, open_flags, current_cred()); + if (IS_ERR(file)) return PTR_ERR(file); - } - if (!S_ISREG(file_inode(file)->i_mode)) { - kfree(acct); - filp_close(file, NULL); + if (!S_ISREG(file_inode(file)->i_mode)) return -EACCES; - } /* Exclude kernel kernel internal filesystems. */ - if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) return -EINVAL; - } /* Exclude procfs and sysfs. 
*/ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) return -EINVAL; - } - if (!(file->f_mode & FMODE_CAN_WRITE)) { - kfree(acct); - filp_close(file, NULL); + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EIO; - } - internal = mnt_clone_internal(&file->f_path); - if (IS_ERR(internal)) { - kfree(acct); - filp_close(file, NULL); - return PTR_ERR(internal); - } - err = mnt_get_write_access(internal); - if (err) { - mntput(internal); - kfree(acct); - filp_close(file, NULL); - return err; - } - mnt = file->f_path.mnt; - file->f_path.mnt = internal; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; atomic_long_set(&acct->count, 1); init_fs_pin(&acct->pin, acct_pin_kill); - acct->file = file; + acct->file = no_free_ptr(file); acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); INIT_WORK(&acct->work, close_work); init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - pin_insert(&acct->pin, mnt); + pin_insert(&acct->pin, original_file->f_path.mnt); rcu_read_lock(); old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_put_write_access(mnt); - mntput(mnt); return 0; } @@ -319,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - struct filename *tmp = getname(name); - - if (IS_ERR(tmp)) - return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); - error = acct_on(tmp); + error = acct_on(name); mutex_unlock(&acct_on_mutex); - putname(tmp); } else { rcu_read_lock(); pin_kill(task_active_pid_ns(current)->bacct); @@ -544,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct) static void acct_write_process(struct bsd_acct_struct *acct) { struct file *file = acct->file; - const struct cred *cred; acct_t *ac = &acct->ac; /* Perform file operations on behalf of whoever enabled accounting */ - cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. Then get freeze protection. If - * the fs is frozen, just skip the write as we could deadlock - * the system otherwise. - */ - if (check_free_space(acct) && file_start_write_trylock(file)) { - /* it's been opened O_APPEND, so position is irrelevant */ - loff_t pos = 0; - __kernel_write(file, ac, sizeof(acct_t), &pos); - file_end_write(file); + scoped_with_creds(file->f_cred) { + /* + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. 
+ */ + if (check_free_space(acct) && file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, ac, sizeof(acct_t), &pos); + file_end_write(file); + } } - - revert_creds(cred); } static void do_acct_process(struct bsd_acct_struct *acct) diff --git a/kernel/audit.c b/kernel/audit.c index 5f5bf85bcc90..26a332ffb1b8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> +#include <linux/lsm_hooks.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> @@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +203,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -279,6 +289,33 @@ static pid_t auditd_pid_vnr(void) } /** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. 
+ */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } +} + +/** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * @@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, if (!ab) return NULL; + skb_queue_head_init(&ab->skb_list); + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1833,11 +1877,11 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) + struct audit_stamp *stamp) { - if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_real_ts64(t); - *serial = audit_serial(); + if (!ctx || !auditsc_get_stamp(ctx, stamp)) { + ktime_get_coarse_real_ts64(&stamp->ctime); + stamp->serial = audit_serial(); } } @@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec64 t; - unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &t, &serial); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -1956,8 +2000,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. 
*/ -static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, - va_list args) +static __printf(2, 0) +void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; @@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. 
+ */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { @@ -2285,7 +2475,7 @@ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; - if (!audit_enabled || audit_dummy_context()) + if (!audit_enabled) return; /* Generate log with subject, operation, outcome. 
*/ @@ -2411,6 +2601,28 @@ int audit_signal_info(int sig, struct task_struct *t) } /** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else { + audit_log_lost("rate limit exceeded"); + kfree_skb(skb); + } +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * @@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; - - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 0211cb307d30..7c401729e21b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -99,6 +99,12 @@ struct audit_proctitle { char *value; /* the cmdline field */ }; +/* A timestamp/serial pair to identify an event */ +struct audit_stamp { + struct timespec64 ctime; /* time of syscall entry */ + unsigned int serial; /* serial number for record */ +}; + /* The per-task audit context. 
*/ struct audit_context { int dummy; /* must be the first element */ @@ -108,10 +114,9 @@ struct audit_context { AUDIT_CTX_URING, /* in use by io_uring */ } context; enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ + struct audit_stamp stamp; /* event identifier */ int major; /* syscall number */ int uring_op; /* uring operation */ - struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -133,7 +138,7 @@ struct audit_context { struct audit_aux_data *aux_pids; struct sockaddr_storage *sockaddr; size_t sockaddr_len; - /* Save things to print about task_struct */ + /* Save things to print about task_struct */ pid_t ppid; kuid_t uid, euid, suid, fsuid; kgid_t gid, egid, sgid, fsgid; @@ -200,7 +205,7 @@ struct audit_context { int argc; } execve; struct { - char *name; + const char *name; } module; struct { struct audit_ntp_data ntp_data; @@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty); extern unsigned int audit_serial(void); #ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial); + struct audit_stamp *stamp); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); @@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else /* CONFIG_AUDITSYSCALL */ -#define auditsc_get_stamp(c, t, s) 0 +#define auditsc_get_stamp(c, s) 0 #define audit_put_watch(w) do { } while (0) #define audit_get_watch(w) do { } while (0) #define audit_to_watch(k, p, l, o) (-EINVAL) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac8..b92805b317a2 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..fda6beb041e0 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; + size_t sz; - tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); + sz = strlen(s) + 1; + tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; @@ -103,7 
+105,7 @@ static struct audit_tree *alloc_tree(const char *s) INIT_LIST_HEAD(&tree->list); INIT_LIST_HEAD(&tree->same_root); tree->root = NULL; - strcpy(tree->pathname, s); + strscpy(tree->pathname, s, sz); } return tree; } @@ -668,12 +670,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +679,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + const struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +693,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +703,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (const struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +742,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (const struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +806,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + const struct path *paths; int err; rule->tree = NULL; @@ -828,16 +834,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +878,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + const struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +921,7 @@ int audit_tag_tree(char *old, char 
*new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +962,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f..a700e3c8925f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e3f42018ed46..6a86c0683b67 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -638,10 +638,9 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) void *bufp; int i; - data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); + data = kzalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; - memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; data->action = krule->action; @@ -1326,7 +1325,7 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par /* handle trailing slashes */ pathlen -= parentlen; - while (p[pathlen - 1] == '/') + while (pathlen > 0 && p[pathlen - 1] == '/') pathlen--; if (pathlen != dlen) @@ -1440,7 +1439,7 @@ static int update_lsm_rule(struct audit_krule *r) } /* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM + * It will traverse the filter lists searching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 78fd876a5473..dd0563a8e0be 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx) */ ctx->current_state = ctx->state; - ctx->serial = 0; + ctx->stamp.serial = 0; + ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; ctx->major = 0; ctx->uring_op = 0; - ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? 
~0ULL : 0); @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { @@ -1785,8 +1763,9 @@ static void audit_log_exit(void) audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - &context->target_ref, context->target_comm)) - call_panic = 1; + &context->target_ref, + context->target_comm)) + call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); @@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op) ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; - ktime_get_coarse_real_ts64(&ctx->ctime); + ktime_get_coarse_real_ts64(&ctx->stamp.ctime); } /** @@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; - ktime_get_coarse_real_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->stamp.ctime); } /** @@ -2437,41 +2416,36 @@ void __audit_inode_child(struct inode *parent, if (inode) handle_one(inode); - /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name || - (n->type != AUDIT_TYPE_PARENT && - n->type != AUDIT_TYPE_UNKNOWN)) + /* can only match entries that have a name */ + if (!n->name) continue; - if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && - !audit_compare_dname_path(dname, - n->name->name, n->name_len)) { - if (n->type == AUDIT_TYPE_UNKNOWN) - n->type = AUDIT_TYPE_PARENT; + /* look for a parent entry first */ + if (!found_parent && + (n->type == AUDIT_TYPE_PARENT || n->type == 
AUDIT_TYPE_UNKNOWN) && + (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && + !audit_compare_dname_path(dname, n->name->name, n->name_len))) { + n->type = AUDIT_TYPE_PARENT; found_parent = n; - break; - } - } - - cond_resched(); - - /* is there a matching child entry? */ - list_for_each_entry(n, &context->names_list, list) { - /* can only match entries that have a name */ - if (!n->name || - (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) + if (found_child) + break; continue; + } - if (!strcmp(dname->name, n->name->name) || - !audit_compare_dname_path(dname, n->name->name, + /* is there a matching child entry? */ + if (!found_child && + (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) && + (!strcmp(dname->name, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : - AUDIT_NAME_FULL)) { + AUDIT_NAME_FULL))) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = type; found_child = n; - break; + if (found_parent) + break; } } @@ -2508,21 +2482,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec64 to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context + * @stamp: timestamp to record * * Also sets the context as auditable. */ -int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) +int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; + if (!ctx->stamp.serial) + ctx->stamp.serial = audit_serial(); + *stamp = ctx->stamp; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; @@ -2864,7 +2834,7 @@ void __audit_openat2_how(struct open_how *how) context->type = AUDIT_OPENAT2; } -void __audit_log_kern_module(char *name) +void __audit_log_kern_module(const char *name) { struct audit_context *context = audit_context(); diff --git a/kernel/bounds.c b/kernel/bounds.c index 29b2cd00df2c..02b619eb6106 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -6,6 +6,7 @@ */ #define __GENERATING_BOUNDS_H +#define COMPILE_OFFSETS /* Include headers that define the enum constants of interest */ #include <linux/page-flags.h> #include <linux/mmzone.h> diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 17067dcb4386..eb3de35734f0 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -3,7 +3,7 @@ # BPF interpreter that, for example, classic socket filters depend on. config BPF bool - select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 # Used by archs to tell that they support BPF JIT compiler plus which # flavour. 
Only one of the two can be selected for a specific arch since diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 70502f038b92..232cbc97434d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,15 +6,15 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif @@ -53,9 +53,13 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o +ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) +obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 0d56cea71602..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) @@ -550,6 +550,34 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) } } +/* + * Reserve an arena virtual address range without populating it. This call stops + * bpf_arena_alloc_pages from adding pages to this range. + */ +static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) +{ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + long pgoff; + int ret; + + if (uaddr & ~PAGE_MASK) + return 0; + + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + return -EINVAL; + + guard(mutex)(&arena->lock); + + /* Cannot guard already allocated pages. */ + ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); + if (ret) + return -EBUSY; + + /* "Allocate" the region to prevent it from being allocated. 
*/ + return range_tree_clear(&arena->rt, pgoff, page_cnt); +} + __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, @@ -573,11 +601,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt return; arena_free_pages(arena, (long)ptr__ign, page_cnt); } + +__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA) + return -EINVAL; + + if (!page_cnt) + return 0; + + return arena_reserve_pages(arena, (long)ptr__ign, page_cnt); +} __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -590,3 +633,33 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + u64 user_vm_start; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + + /* Use main prog for stream access */ + prog = prog->aux->main_prog_aux->prog; + + user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + addr += clear_lo32(user_vm_start); + + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? "WRITE" : "READ", addr); + bpf_stream_dump_stack(ss); + })); +} diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index eb28c0f219ee..1eeb31c5b317 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> +#include <crypto/sha2.h> #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -323,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) } /* Called from syscall */ -static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? 
*(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; - if (index >= array->map.max_entries) { + if (index >= map->max_entries) { *next = 0; return 0; } - if (index == array->map.max_entries - 1) + if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; @@ -431,22 +442,17 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer and workqueue - * on uref dropping to zero. - */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { - for (i = 0; i < array->map.max_entries; i++) { - if (btf_record_has_field(map->record, BPF_TIMER)) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); - } - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -530,8 +536,6 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || @@ -544,14 +548,11 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes sure + /* + * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; @@ -787,8 +788,8 @@ const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_get_next_key = bpf_array_get_next_key, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, @@ -805,6 +806,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { @@ -812,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, @@ -1201,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, @@ -1305,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, @@ -1341,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, @@ -1426,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 148da8f7ff36..0687a760974a 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; @@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); out: - rcu_read_unlock(); 
- migrate_enable(); + rcu_read_unlock_migrate(); } static struct bpf_local_storage_data * diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 15a3eb9b02d9..e54cce2b9175 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -62,8 +62,7 @@ void bpf_inode_storage_free(struct inode *inode) if (!bsb) return; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(bsb->storage); if (!local_storage) @@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode) bpf_local_storage_destroy(local_storage); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 000000000000..c96630cb75bf --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ + +#include <linux/bpf.h> + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array *insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + /* BPF programs aren't allowed to write to the map */ + insn_array->map.map_flags |= BPF_F_RDONLY_PROG; + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map 
*map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + if ((off % sizeof(long)) != 0 || + (off / sizeof(long)) >= map->max_entries) + return -EINVAL; + + /* from BPF's point of view, this map is a jump table */ + *imm = (unsigned long)insn_array->ips + off; + + return 0; +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_direct_value_addr = insn_array_map_direct_value_addr, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. 
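The new BPF_MAP_TYPE_INSN_ARRAY keeps three offsets per entry: the user-supplied orig_off, an xlated_off that is kept in sync as the verifier patches instructions, and a jitted_off/ip filled in later by the JIT via bpf_prog_update_insn_ptrs(). As a rough userspace sketch (names local to the example, not kernel API), this is the adjustment that bpf_insn_array_adjust(), shown a little further down in this file, applies when one instruction at offset off is rewritten into len instructions; bpf_insn_array_adjust_after_remove() does the inverse bookkeeping and marks entries inside a removed range as INSN_DELETED.

#include <stdio.h>

#define INSN_DELETED ((unsigned int)-1)

struct entry { unsigned int orig_off, xlated_off; };

/* Mirrors bpf_insn_array_adjust(): one insn at 'off' became 'len' insns. */
static void adjust_after_patch(struct entry *e, int n, unsigned int off, unsigned int len)
{
	if (len <= 1)
		return;
	for (int i = 0; i < n; i++) {
		if (e[i].xlated_off == INSN_DELETED || e[i].xlated_off <= off)
			continue;
		e[i].xlated_off += len - 1;
	}
}

int main(void)
{
	struct entry e[] = { {0, 0}, {3, 3}, {7, 7} };

	/* the instruction at xlated offset 3 was rewritten into 4 instructions */
	adjust_after_patch(e, 3, 3, 4);
	for (int i = 0; i < 3; i++)
		printf("orig %u -> xlated %u\n", e[i].orig_off, e[i].xlated_off);
	/* expected: 0 -> 0, 3 -> 3, 7 -> 10 */
	return 0;
}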
+ */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. + */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 380e9a7cac75..eec60b57bd3d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -38,8 +38,7 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -257,7 +256,7 @@ static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link, __get_seq_info(link)); + return prepare_seq_file(file, link); } static int iter_release(struct inode *inode, struct file *file) @@ -553,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!link) return -ENOMEM; - bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, 
&bpf_iter_link_lops, prog, + attr->link_create.attach_type); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); @@ -586,9 +586,9 @@ static void init_seq_meta(struct bpf_iter_priv_data *priv_data, priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) { + const struct bpf_iter_seq_info *seq_info = __get_seq_info(link); struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; @@ -634,37 +634,24 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; - struct file *file; unsigned int flags; - int err, fd; + int err; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + + FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags)); + if (fdf.err) + return fdf.err; iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); + err = prepare_seq_file(fd_prepare_file(fdf), iter_link); if (err) - goto free_file; - - fd_install(fd, file); - return fd; + return err; /* Automatic cleanup handles fput */ -free_file: - fput(file); -free_fd: - put_unused_fd(fd); - return err; + return fd_publish(fdf); } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) @@ -705,13 +692,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock_trace(); } else { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } /* bpf program can only return 0 or 1: diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index fa56c30833ff..e2fe6c32822b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) + void *value, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; - if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + if (mem_charge(smap, owner, smap->elem_size)) return NULL; - if (smap->bpf_ma) { - selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); - if (selem) - /* Keep the original bpf_map_kzalloc behavior - * before started using the bpf_mem_cache_alloc. - * - * No need to use zero_map_value. The bpf_selem_free() - * only does bpf_mem_cache_free when there is - * no other bpf prog is using the selem. 
- */ - memset(SDATA(selem)->data, 0, smap->map.value_size); + if (smap->use_kmalloc_nolock) { + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, + __GFP_ZERO, NUMA_NO_NODE); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return selem; } - if (charge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); return NULL; } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } +/* Handle use_kmalloc_nolock == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); - bpf_mem_cache_raw_free(local_storage); + kfree_nolock(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) @@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_local_storage_free_rcu); } -/* Handle bpf_ma == false */ -static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(local_storage, rcu); - else - call_rcu_tasks_trace(&local_storage->rcu, - __bpf_local_storage_free_trace_rcu); -} - static void bpf_local_storage_free(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool bpf_ma, bool reuse_now) + bool reuse_now) { if (!local_storage) return; - if (!bpf_ma) { + if (!local_storage->use_kmalloc_nolock) { __bpf_local_storage_free(local_storage, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_trace_rcu); + if (reuse_now) { + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); return; } - if (smap) - bpf_mem_cache_free(&smap->storage_ma, local_storage); - else - /* smap could be NULL if the selem that triggered - * this 'local_storage' creation had been long gone. - * In this case, directly do call_rcu(). 
- */ - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(selem, rcu); } -/* Handle bpf_ma == false */ +/* Handle use_kmalloc_nolock == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { @@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) migrate_disable(); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); migrate_enable(); - bpf_mem_cache_raw_free(selem); + kfree_nolock(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) @@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) } void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now) { - if (!smap->bpf_ma) { - /* Only task storage has uptrs and task storage - * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true - * for task storage, so this bpf_obj_free_fields() won't unpin - * any uptr. + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (!smap->use_kmalloc_nolock) { + /* + * No uptr will be unpin even when reuse_now == false since uptr + * is only supported in task local storage, where + * smap->use_kmalloc_nolock == true. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); @@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } if (reuse_now) { - /* reuse_now == true only happens when the storage owner - * (e.g. task_struct) is being destructed or the map itself - * is being destructed (ie map_free). In both cases, - * no bpf prog can have a hold on the selem. It is - * safe to unpin the uptrs and free the selem now. - */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - /* Instead of using the vanilla call_rcu(), - * bpf_mem_cache_free will be able to reuse selem - * immediately. + /* + * While it is okay to call bpf_obj_free_fields() that unpins uptr when + * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity. */ - bpf_mem_cache_free(&smap->selem_ma, selem); + call_rcu(&selem->rcu, bpf_selem_free_rcu); return; } @@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct hlist_node *n; /* The "_safe" iteration is needed. @@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ - hlist_for_each_entry_safe(selem, n, list, free_node) { - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - bpf_selem_free(selem, smap, reuse_now); - } + hlist_for_each_entry_safe(selem, n, list, free_node) + bpf_selem_free(selem, reuse_now); } /* local_storage->lock must be held and selem->local_storage == local_storage. 
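With bpf_mem_alloc gone, bpf_local_storage_free() above chooses one of three paths based on use_kmalloc_nolock and reuse_now. The following compile-and-run sketch models only that visible branch structure; the RCU callbacks are stubbed with printf() and none of the actual grace-period semantics are implied.

#include <stdbool.h>
#include <stdio.h>

static void pick_free_path(bool use_kmalloc_nolock, bool reuse_now)
{
	if (!use_kmalloc_nolock) {
		/* legacy kzalloc-backed storage: vanilla RCU or tasks-trace RCU */
		if (reuse_now)
			printf("kfree_rcu(local_storage)\n");
		else
			printf("call_rcu_tasks_trace(__bpf_local_storage_free_trace_rcu)\n");
		return;
	}
	if (reuse_now) {
		printf("call_rcu(bpf_local_storage_free_rcu) -> kfree_nolock()\n");
		return;
	}
	printf("call_rcu_tasks_trace(bpf_local_storage_free_trace_rcu) -> kfree_nolock()\n");
}

int main(void)
{
	pick_free_path(true, true);
	pick_free_path(true, false);
	pick_free_path(false, false);
	return 0;
}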
@@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, struct hlist_head *free_selem_list) + struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); @@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *storage_smap, - struct bpf_local_storage_elem *selem) -{ - - struct bpf_local_storage_map *selem_smap; - - /* local_storage->smap may be NULL. If it is, get the bpf_ma - * from any selem in the local_storage->list. The bpf_ma of all - * local_storage and selem should have the same value - * for the same map type. - * - * If the local_storage->list is already empty, the caller will not - * care about the bpf_ma value also because the caller is not - * responsible to free the local_storage. - */ - - if (storage_smap) - return storage_smap->bpf_ma; - - if (!selem) { - struct hlist_node *n; - - n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), - bpf_rcu_lock_held()); - if (!n) - return false; - - selem = hlist_entry(n, struct bpf_local_storage_elem, snode); - } - selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - - return selem_smap->bpf_ma; -} - static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool bpf_ma, free_local_storage = false; + bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; @@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - storage_smap = rcu_dereference_check(local_storage->smap, - bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &selem_free_list); + local_storage, selem, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); if (free_local_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); + bpf_local_storage_free(local_storage, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } @@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if (smap->bpf_ma) - storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, 
gfp_flags); + if (smap->use_kmalloc_nolock) + storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), + __GFP_ZERO, NUMA_NO_NODE); else storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); @@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner, INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; + storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); @@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner, bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_local_storage_map_free() does a - * synchronize_rcu_mult (waiting for both sleepable and - * normal programs) before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ } return 0; uncharge: - bpf_local_storage_free(storage, smap, smap->bpf_ma, true); + bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - bpf_selem_free(selem, smap, true); + bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, &old_selem_free_list); + &old_selem_free_list); } unlock: @@ -664,7 +593,7 @@ unlock: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(alloc_selem, smap, true); + bpf_selem_free(alloc_selem, true); } return err ? 
ERR_PTR(err) : SDATA(selem); } @@ -722,13 +651,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; @@ -736,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool bpf_ma, free_storage = false; + bool free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; - storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); - /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the @@ -768,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &free_selem_list); + local_storage, selem, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); if (free_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); + bpf_local_storage_free(local_storage, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -788,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } -/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. - * A deadlock free allocator is useful for storage that the bpf prog can easily - * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. - * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses - * memory immediately. To be reuse-immediate safe, the owner destruction - * code path needs to go through a rcu grace period before calling - * bpf_local_storage_destroy(). - * - * When bpf_ma == false, the kmalloc and kfree are used. - */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma) + bool use_kmalloc_nolock) { struct bpf_local_storage_map *smap; unsigned int i; @@ -835,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non * preemptible context. Thus, enforce all storages to use - * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. + * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled. */ - smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma; - if (smap->bpf_ma) { - err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); - if (err) - goto free_smap; - - err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); - if (err) { - bpf_mem_alloc_destroy(&smap->selem_ma); - goto free_smap; - } - } + smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
true : use_kmalloc_nolock; smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; @@ -918,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - if (smap->bpf_ma) { + if (smap->use_kmalloc_nolock) { rcu_barrier_tasks_trace(); - if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); - bpf_mem_alloc_destroy(&smap->selem_ma); - bpf_mem_alloc_destroy(&smap->storage_ma); + rcu_barrier(); } kvfree(smap->buckets); bpf_map_area_free(smap); diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..e7a2fc60523f 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -19,14 +19,6 @@ #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -static int get_next_cpu(int cpu) -{ - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_possible_mask); - return cpu; -} - /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { @@ -337,12 +329,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); - steal = get_next_cpu(steal); + steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; @@ -577,6 +569,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0a59df1c550a..7cb6e8d4282c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity) BTF_ID(func, bpf_lsm_audit_rule_match) #endif BTF_ID(func, bpf_lsm_ismaclabel) +BTF_ID(func, bpf_lsm_file_alloc_security) BTF_SET_END(bpf_lsm_disabled_hooks) /* List of LSM hooks that should operate on 'current' cgroup regardless diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index db13ee70d94d..278490683d28 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -601,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? 
: -EFAULT; @@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog); + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); @@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata) map = __bpf_map_inc_not_zero(&st_map->map, false); return !IS_ERR(map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_get); void bpf_struct_ops_put(const void *kdata) { @@ -1173,6 +1174,19 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_put); + +u32 bpf_struct_ops_id(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + return st_map->map.id; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_id); static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { @@ -1351,7 +1365,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) err = -ENOMEM; goto err_out; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, + attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 1109475953c0..a1dc1bf0848a 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) @@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task) bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfa..0de8fc8a0e0b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -26,6 +26,7 @@ #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> @@ -857,26 +858,43 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) EXPORT_SYMBOL_GPL(btf_type_by_id); /* - * Regular int is not a bit field and it must be either - * u8/u16/u32/u64 or __int128. + * Check that the type @t is a regular int. This means that @t is not + * a bit field and it has the same size as either of u8/u16/u32/u64 + * or __int128. If @expected_size is not zero, then size of @t should + * be the same. A caller should already have checked that the type @t + * is an integer. 
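The btf_type_is_i32()/btf_type_is_i64() helpers introduced just below (and used by the array, insn_array and local-storage ->map_check_btf callbacks earlier in this diff) reduce to __btf_type_int_is_regular() with an expected size of 4 or 8 bytes. A standalone model of that predicate, with the BTF int encoding simplified to plain nr_bits/bit_offset parameters:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool int_is_regular(unsigned int nr_bits, unsigned int bit_offset,
			   size_t expected_size)
{
	unsigned int nr_bytes = (nr_bits + 7) / 8;

	return (nr_bits % 8) == 0 &&			/* not a bit-field width   */
	       bit_offset == 0 &&			/* not offset inside a word */
	       nr_bytes != 0 && nr_bytes <= 16 &&
	       (nr_bytes & (nr_bytes - 1)) == 0 &&	/* u8/u16/u32/u64/__int128 */
	       (expected_size == 0 || nr_bytes == expected_size);
}

int main(void)
{
	printf("u32 as i32 key:          %d\n", int_is_regular(32, 0, 4)); /* 1 */
	printf("u64 as i32 key:          %d\n", int_is_regular(64, 0, 4)); /* 0 */
	printf("24-bit int at offset 3:  %d\n", int_is_regular(24, 3, 0)); /* 0 */
	return 0;
}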
*/ +static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + return BITS_PER_BYTE_MASKED(nr_bits) == 0 && + BTF_INT_OFFSET(int_data) == 0 && + (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && + (expected_size == 0 || nr_bytes == expected_size); +} + static bool btf_type_int_is_regular(const struct btf_type *t) { - u8 nr_bits, nr_bytes; - u32 int_data; + return __btf_type_int_is_regular(t, 0); +} - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && - nr_bytes != (2 * sizeof(u64)))) { - return false; - } +bool btf_type_is_i32(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); +} - return true; +bool btf_type_is_i64(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); +} + +bool btf_type_is_primitive(const struct btf_type *t) +{ + return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || + btf_is_any_enum(t); } /* @@ -3442,7 +3460,8 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); @@ -3459,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, return BTF_FIELD_FOUND; } -#define field_mask_test_name(field_type, field_type_str) \ - if (field_mask & field_type && !strcmp(name, field_type_str)) { \ - type = field_type; \ - goto end; \ - } - static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, - u32 field_mask, u32 *seen_mask, - int *align, int *sz) -{ - int type = 0; + u32 field_mask, u32 *seen_mask, int *align, int *sz) +{ + const struct { + enum btf_field_type type; + const char *const name; + const bool is_unique; + } field_types[] = { + { BPF_SPIN_LOCK, "bpf_spin_lock", true }, + { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, + { BPF_TIMER, "bpf_timer", true }, + { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, + { BPF_LIST_HEAD, "bpf_list_head", false }, + { BPF_LIST_NODE, "bpf_list_node", false }, + { BPF_RB_ROOT, "bpf_rb_root", false }, + { BPF_RB_NODE, "bpf_rb_node", false }, + { BPF_REFCOUNT, "bpf_refcount", false }, + }; + int type = 0, i; const char *name = __btf_name_by_offset(btf, var_type->name_off); - - if (field_mask & BPF_SPIN_LOCK) { - if (!strcmp(name, "bpf_spin_lock")) { - if (*seen_mask & BPF_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_SPIN_LOCK; - type = BPF_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_RES_SPIN_LOCK) { - if (!strcmp(name, "bpf_res_spin_lock")) { - if (*seen_mask & BPF_RES_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_RES_SPIN_LOCK; - type = BPF_RES_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_TIMER) { - if (!strcmp(name, "bpf_timer")) { - if (*seen_mask & BPF_TIMER) - return -E2BIG; - *seen_mask |= BPF_TIMER; - type = BPF_TIMER; - goto end; - } - } - if (field_mask & BPF_WORKQUEUE) { - 
if (!strcmp(name, "bpf_wq")) { - if (*seen_mask & BPF_WORKQUEUE) + const char *field_type_name; + enum btf_field_type field_type; + bool is_unique; + + for (i = 0; i < ARRAY_SIZE(field_types); ++i) { + field_type = field_types[i].type; + field_type_name = field_types[i].name; + is_unique = field_types[i].is_unique; + if (!(field_mask & field_type) || strcmp(name, field_type_name)) + continue; + if (is_unique) { + if (*seen_mask & field_type) return -E2BIG; - *seen_mask |= BPF_WORKQUEUE; - type = BPF_WORKQUEUE; - goto end; + *seen_mask |= field_type; } + type = field_type; + goto end; } - field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); - field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); - field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); - field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); - field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { @@ -3526,8 +3530,6 @@ end: return type; } -#undef field_mask_test_name - /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for @@ -3674,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) @@ -3957,7 +3960,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -3966,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4005,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -5583,7 +5591,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (id < 0) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5610,7 +5618,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (ret != BTF_FIELD_FOUND) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5647,7 +5655,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) continue; parse: tab_cnt = tab ? 
tab->cnt : 0; - new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; @@ -6181,8 +6189,7 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty return kctx_type_id; } -BTF_ID_LIST(bpf_ctx_convert_btf_id) -BTF_ID(struct, bpf_ctx_convert) +BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert) static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, void *data, unsigned int data_size) @@ -6383,16 +6390,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_int_ptr(struct btf *btf, const struct btf_type *t) +static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_int(t); + return btf_type_is_void(t) || btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6541,6 +6547,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ @@ -6671,7 +6678,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. @@ -6744,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -6776,14 +6783,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } - if (t->type == 0) - /* This is a pointer to void. - * It is the same as scalar from the verifier safety pov. - * No further pointer walking is allowed. - */ - return true; - - if (is_int_ptr(btf, t)) + /* + * If it's a pointer to void, it's the same as scalar from the verifier + * safety POV. Either way, no futher pointer walking is allowed. + */ + if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -6829,10 +6833,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? 
*/ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } @@ -6905,6 +6909,7 @@ enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, + WALK_PTR_UNTRUSTED, WALK_STRUCT, }; @@ -7146,6 +7151,8 @@ error: *field_name = mname; return WALK_PTR; } + + return WALK_PTR_UNTRUSTED; } /* Allow more flexible access within an int as long as @@ -7218,6 +7225,9 @@ int btf_struct_access(struct bpf_verifier_log *log, *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; + case WALK_PTR_UNTRUSTED: + *flag = MEM_RDONLY | PTR_UNTRUSTED; + return PTR_TO_MEM; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: @@ -7313,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) return t->size; return -EINVAL; } @@ -7322,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; - if (__btf_type_is_struct(t)) + if (btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; @@ -7363,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0 || __btf_type_is_struct(t)) { + if (ret < 0 || btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); @@ -7630,11 +7640,12 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = BIT_ULL(0), - ARG_TAG_NONNULL = BIT_ULL(1), - ARG_TAG_TRUSTED = BIT_ULL(2), - ARG_TAG_NULLABLE = BIT_ULL(3), - ARG_TAG_ARENA = BIT_ULL(4), + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_UNTRUSTED = BIT_ULL(3), + ARG_TAG_NULLABLE = BIT_ULL(4), + ARG_TAG_ARENA = BIT_ULL(5), }; /* Process BTF of a function to produce high-level expectation of function @@ -7662,7 +7673,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return 0; if (!prog->aux->func_info) { - bpf_log(log, "Verifier bug\n"); + verifier_bug(env, "func_info undefined"); return -EFAULT; } @@ -7686,7 +7697,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { - bpf_log(log, "Verifier bug in function %s()\n", tname); + verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) @@ -7742,6 +7753,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "untrusted") == 0) { + tags |= ARG_TAG_UNTRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { @@ -7802,6 +7815,38 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_UNTRUSTED) { + struct btf *vmlinux_btf; + int kern_type_id; + + if (tags & ~ARG_TAG_UNTRUSTED) { + bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (btf_type_is_void(ref_t) || 
btf_type_is_primitive(ref_t)) { + sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + sub->args[i].mem_size = 0; + continue; + } + + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); + if (kern_type_id < 0) + return kern_type_id; + + vmlinux_btf = bpf_get_btf_vmlinux(); + ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); + if (!btf_type_is_struct(ref_t)) { + tname = __btf_name_by_offset(vmlinux_btf, t->name_off); + bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", + i, btf_type_str(ref_t), tname); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED; + sub->args[i].btf_id = kern_type_id; + continue; + } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); @@ -8185,7 +8230,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf->data; - attr->read_new = sysfs_bin_attr_simple_read; + attr->read = sysfs_bin_attr_simple_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { @@ -8563,7 +8608,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Grow set */ set = krealloc(tab->sets[hook], - offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), + struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -8849,7 +8894,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c } tab = krealloc(btf->dtor_kfunc_tab, - offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; @@ -9021,7 +9066,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, bpf_free_cands_from_cache(*cc); *cc = NULL; } - new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9029,7 +9074,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. */ - new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); @@ -9113,7 +9158,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, continue; /* most of the time there is only one candidate for a given kind+name pair */ - new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9230,7 +9275,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. 
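Two mechanical conversions recur throughout btf.c in this diff: BTF-related allocations move to GFP_KERNEL_ACCOUNT so they are charged to the calling task's memory cgroup, and open-coded offsetof(..., flex[n]) sizes become struct_size(), which saturates rather than wrapping on overflow. A userspace stand-in for the sizing half, with the compiler overflow builtins standing in for the kernel's size_mul()/size_add():

#include <stdint.h>
#include <stdio.h>

struct rec {
	uint32_t cnt;
	uint64_t fields[];	/* flexible array, like btf_record::fields */
};

static size_t rec_size(size_t cnt)
{
	size_t bytes;

	/* offsetof(struct rec, fields[cnt]) is sizeof(struct rec) + cnt * elem,
	 * but the multiplication can overflow for a hostile cnt.
	 */
	if (__builtin_mul_overflow(cnt, sizeof(uint64_t), &bytes) ||
	    __builtin_add_overflow(bytes, sizeof(struct rec), &bytes))
		return SIZE_MAX;	/* saturate, as struct_size() does */
	return bytes;
}

int main(void)
{
	printf("rec_size(4) = %zu\n", rec_size(4));
	printf("huge cnt saturates: %d\n", rec_size(SIZE_MAX / 4) == SIZE_MAX);
	return 0;
}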
*/ - specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT); if (!specs) return -ENOMEM; @@ -9255,7 +9300,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, goto out; } if (cc->cnt) { - cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT); if (!cands.cands) { err = -ENOMEM; goto out; @@ -9407,8 +9452,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, tab = btf->struct_ops_tab; if (!tab) { - tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), - GFP_KERNEL); + tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; @@ -9421,8 +9465,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, - offsetof(struct btf_struct_ops_tab, - ops[tab->capacity * 2]), + struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 84f58f3d028a..69988af44b37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,20 +27,34 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); /* * cgroup bpf destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup bpf - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_bpf_destroy_wq; static int __init cgroup_bpf_wq_init(void) { - cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", + WQ_PERCPU, 1); if (!cgroup_bpf_destroy_wq) panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); return 0; } core_initcall(cgroup_bpf_wq_init); +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data); + +static struct notifier_block cgroup_bpf_lifetime_nb = { + .notifier_call = cgroup_bpf_lifetime_notify, +}; + +void __init cgroup_bpf_lifetime_notifier_init(void) +{ + BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &cgroup_bpf_lifetime_nb)); +} + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. 
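cgroup.c no longer exposes cgroup_bpf_inherit()/cgroup_bpf_offline() for the cgroup core to call directly; it registers cgroup_bpf_lifetime_nb on the cgroup_lifetime_notifier chain and reacts to ONLINE/OFFLINE events in the handler shown a little further down. A small model of that dispatch shape, with the chain reduced to a plain function-pointer list and the kernel's notifier_from_errno() conversion left out (names here are illustrative only):

#include <stdio.h>

enum lifetime_event { LIFETIME_ONLINE, LIFETIME_OFFLINE };

typedef int (*lifetime_cb)(enum lifetime_event ev, void *cgrp);

/* subsystem-side handler, mirroring cgroup_bpf_lifetime_notify()'s switch */
static int bpf_lifetime_cb(enum lifetime_event ev, void *cgrp)
{
	switch (ev) {
	case LIFETIME_ONLINE:
		printf("cgroup %p online  -> inherit effective progs\n", cgrp);
		return 0;
	case LIFETIME_OFFLINE:
		printf("cgroup %p offline -> drop bpf refcount\n", cgrp);
		return 0;
	}
	return 0;
}

/* the chain owner simply walks registered callbacks on each event */
static lifetime_cb chain[4];
static int nr_cbs;

static void chain_register(lifetime_cb cb) { chain[nr_cbs++] = cb; }

static void chain_notify(enum lifetime_event ev, void *cgrp)
{
	for (int i = 0; i < nr_cbs; i++)
		chain[i](ev, cgrp);
}

int main(void)
{
	int dummy_cgrp;

	chain_register(bpf_lifetime_cb);
	chain_notify(LIFETIME_ONLINE, &dummy_cgrp);
	chain_notify(LIFETIME_OFFLINE, &dummy_cgrp);
	return 0;
}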
*/ @@ -58,8 +72,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, u32 func_ret; run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); array = rcu_dereference(cgrp->effective[atype]); item = &array->items[0]; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); @@ -75,8 +88,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, item++; } bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); return run_ctx.retval; } @@ -206,7 +218,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) } #endif /* CONFIG_BPF_LSM */ -void cgroup_bpf_offline(struct cgroup *cgrp) +static void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); percpu_ref_kill(&cgrp->bpf.refcnt); @@ -491,7 +503,7 @@ static void activate_effective_progs(struct cgroup *cgrp, * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify */ -int cgroup_bpf_inherit(struct cgroup *cgrp) +static int cgroup_bpf_inherit(struct cgroup *cgrp) { /* has to use marco instead of const int, since compiler thinks * that array below is variable length @@ -534,6 +546,27 @@ cleanup: return -ENOMEM; } +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + int ret = 0; + + if (cgrp->root != &cgrp_dfl_root) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + ret = cgroup_bpf_inherit(cgrp); + break; + case CGROUP_LIFETIME_OFFLINE: + cgroup_bpf_offline(cgrp); + break; + } + + return notifier_from_errno(ret); +} + static int update_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype) { @@ -624,6 +657,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, return NULL; } +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + return link; +} + +static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + return prog; +} + +static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, + struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) +{ + bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; + struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); + bool preorder = flags & BPF_F_PREORDER; + struct bpf_link *anchor_link = NULL; + struct bpf_prog *anchor_prog = NULL; + bool is_before, is_after; + + is_before = flags & BPF_F_BEFORE; + is_after = flags & BPF_F_AFTER; + if (is_link || is_id || id_or_fd) { + /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ + if (is_before == is_after) + return ERR_PTR(-EINVAL); + if ((is_link && !link) || (!is_link && !prog)) + return ERR_PTR(-EINVAL); + } else if (!hlist_empty(progs)) { + /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ + if (is_before && is_after) + return ERR_PTR(-EINVAL); + } + + if (is_link) { + anchor_link = bpf_get_anchor_link(flags, id_or_fd); + if (IS_ERR(anchor_link)) + return ERR_CAST(anchor_link); + } else if (is_id || id_or_fd) { + anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); + if (IS_ERR(anchor_prog)) + return ERR_CAST(anchor_prog); + } + + if (!anchor_prog && !anchor_link) { + 
/* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER + * doesn't matter since either prepend or append to a combined + * list of progs will end up with correct result. + */ + hlist_for_each_entry(pltmp, progs, node) { + if (is_before) + return pltmp; + if (pltmp->node.next) + continue; + return pltmp; + } + return NULL; + } + + hlist_for_each_entry(pltmp, progs, node) { + if ((anchor_prog && anchor_prog == pltmp->prog) || + (anchor_link && anchor_link == &pltmp->link->link)) { + if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) + goto out; + pl = pltmp; + goto out; + } + } + + pl = ERR_PTR(-ENOENT); +out: + if (anchor_link) + bpf_link_put(anchor_link); + else + bpf_prog_put(anchor_prog); + return pl; +} + +static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, + struct bpf_prog *prog, struct bpf_cgroup_link *link, + u32 flags, u32 id_or_fd) +{ + struct bpf_prog_list *pltmp; + + pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); + if (IS_ERR(pltmp)) + return PTR_ERR(pltmp); + + if (!pltmp) + hlist_add_head(&pl->node, progs); + else if (flags & BPF_F_BEFORE) + hlist_add_before(&pl->node, &pltmp->node); + else + hlist_add_behind(&pl->node, &pltmp->node); + + return 0; +} + /** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants @@ -633,6 +776,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags + * @id_or_fd: Relative prog id or fd + * @revision: bpf_prog_list revision * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. @@ -640,7 +785,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type, u32 flags, u32 id_or_fd, + u64 revision) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; @@ -656,6 +802,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) + /* only either replace or insertion with before/after */ + return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; @@ -666,6 +815,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; progs = &cgrp->bpf.progs[atype]; @@ -694,22 +845,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { - struct hlist_node *last = NULL; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - if (hlist_empty(progs)) - hlist_add_head(&pl->node, progs); - else - hlist_for_each(last, progs) { - if (last->next) - continue; - hlist_add_behind(&pl->node, last); - break; - } + + err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); + if (err) { + kfree(pl); + bpf_cgroup_storages_free(new_storage); + return err; + } } pl->prog = prog; @@ -719,7 +866,7 @@ static int 
__cgroup_bpf_attach(struct cgroup *cgrp, cgrp->bpf.flags[atype] = saved_flags; if (type == BPF_LSM_CGROUP) { - err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); if (err) goto cleanup; } @@ -728,6 +875,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup_trampoline; + cgrp->bpf.revisions[atype] += 1; if (old_prog) { if (type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(old_prog); @@ -759,12 +907,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags) + u32 flags, u32 id_or_fd, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, + id_or_fd, revision); cgroup_unlock(); return ret; } @@ -834,7 +983,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); + atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -852,6 +1001,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (!found) return -ENOENT; + cgrp->bpf.revisions[atype] += 1; old_prog = xchg(&link->link.prog, new_prog); replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); @@ -977,12 +1127,14 @@ found: * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation + * @revision: bpf_prog_list revision * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type, + u64 revision) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; @@ -1000,6 +1152,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; + progs = &cgrp->bpf.progs[atype]; flags = cgrp->bpf.flags[atype]; @@ -1025,6 +1180,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ hlist_del(&pl->node); + cgrp->bpf.revisions[atype] += 1; kfree(pl); if (hlist_empty(progs)) @@ -1040,12 +1196,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + enum bpf_attach_type type, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); cgroup_unlock(); return ret; } @@ -1063,6 +1219,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct bpf_prog_array *effective; int cnt, ret = 0, i; int total_cnt = 0; + u64 revision = 0; u32 flags; if (effective_query && prog_attach_flags) @@ -1100,6 +1257,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; + if (!effective_query && from_atype == to_atype) + revision = cgrp->bpf.revisions[from_atype]; + if 
(copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; @@ -1182,7 +1343,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, - attr->attach_type, attr->attach_flags); + attr->attach_type, attr->attach_flags, + attr->relative_fd, attr->expected_revision); if (replace_prog) bpf_prog_put(replace_prog); @@ -1204,7 +1366,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision); if (prog) bpf_prog_put(prog); @@ -1233,8 +1395,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type)); - if (cg_link->type == BPF_LSM_CGROUP) + link->attach_type, 0)); + if (link->attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; @@ -1276,7 +1438,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, "cgroup_id:\t%llu\n" "attach_type:\t%d\n", cg_id, - cg_link->type); + link->attach_type); } static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, @@ -1292,7 +1454,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, cgroup_unlock(); info->cgroup.cgroup_id = cg_id; - info->cgroup.attach_type = cg_link->type; + info->cgroup.attach_type = link->attach_type; return 0; } @@ -1305,6 +1467,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = { .fill_link_info = bpf_cgroup_link_fill_link_info, }; +#define BPF_F_LINK_ATTACH_MASK \ + (BPF_F_ID | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_PREORDER | \ + BPF_F_LINK) + int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; @@ -1312,7 +1481,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct cgroup *cgrp; int err; - if (attr->link_create.flags) + if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) return -EINVAL; cgrp = cgroup_get_from_fd(attr->link_create.target_fd); @@ -1325,9 +1494,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_cgroup; } bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, - prog); + prog, attr->link_create.attach_type); link->cgroup = cgrp; - link->type = attr->link_create.attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -1336,7 +1504,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI); + link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + attr->link_create.cgroup.relative_fd, + attr->link_create.cgroup.expected_revision); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -1495,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * returned value != 1 during execution. In all other cases, 0 is returned. 
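The attach/detach/query hunks above introduce a per-attach-type revision counter: BPF_PROG_QUERY now reports it, and attach/detach accept an expected_revision that fails with -ESTALE when the program list changed in between. Below is a minimal standalone sketch of that optimistic compare-and-retry pattern; query_revision() and attach_prog() are made-up stand-ins driven by a plain counter, not the real bpf(2) wrappers.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Simulated attachment point: one revision counter per attach type,
 * bumped on every successful list update, as cgrp->bpf.revisions[] is. */
static uint64_t revision = 1;

static uint64_t query_revision(void)
{
	return revision;
}

/* Hypothetical attach: a zero expected_revision skips the check, a stale
 * one fails, mirroring the kernel-side "revision && revision != ..." test. */
static int attach_prog(uint64_t expected_revision)
{
	if (expected_revision && expected_revision != revision)
		return -ESTALE;
	revision++;		/* a successful update bumps the revision */
	return 0;
}

int main(void)
{
	for (int attempt = 0; attempt < 3; attempt++) {
		uint64_t rev = query_revision();

		if (attempt == 0)
			revision++;	/* simulate a racing attach by someone else */

		int err = attach_prog(rev);
		if (err == -ESTALE) {
			fprintf(stderr, "attach list changed under us, retrying\n");
			continue;
		}
		if (err)
			return 1;
		printf("attached at revision %llu\n", (unsigned long long)rev);
		return 0;
	}
	return 1;
}

The same loop applies to detach, which takes attr->expected_revision as well; passing 0 preserves the old unconditional behaviour.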
*/ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -1506,7 +1676,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, .uaddr = uaddr, .t_ctx = t_ctx, }; - struct sockaddr_storage unspec; + struct sockaddr_storage storage; struct cgroup *cgrp; int ret; @@ -1518,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, return 0; if (!ctx.uaddr) { - memset(&unspec, 0, sizeof(unspec)); - ctx.uaddr = (struct sockaddr *)&unspec; + memset(&storage, 0, sizeof(storage)); + ctx.uaddr = (struct sockaddr_unsized *)&storage; ctx.uaddrlen = 0; } else { ctx.uaddrlen = *uaddrlen; @@ -1653,10 +1823,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; @@ -2104,7 +2270,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2204,10 +2370,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; @@ -2351,10 +2513,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2418,22 +2576,22 @@ static bool cg_sockopt_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct bpf_sockopt, sk): + case bpf_ctx_range_ptr(struct bpf_sockopt, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; - case offsetof(struct bpf_sockopt, optval): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sockopt, optval_end): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; - case offsetof(struct bpf_sockopt, retval): + case bpf_ctx_range(struct bpf_sockopt, retval): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; @@ -2601,23 +2759,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } - -/* Common helpers for cgroup hooks with valid process context. 
*/ -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; - default: - return NULL; - } -} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba6b6118cf50..1b9b18e5b03c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,6 +18,7 @@ */ #include <uapi/linux/btf.h> +#include <crypto/sha1.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> @@ -38,6 +39,7 @@ #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <linux/execmem.h> +#include <crypto/sha2.h> #include <asm/barrier.h> #include <linux/unaligned.h> @@ -119,6 +121,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->main_prog_aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); @@ -134,6 +137,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -289,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA1_DIGEST_WORDS]; - u32 ws[SHA1_WORKSPACE_WORDS]; - u32 i, bsize, psize, blocks; + size_t size = bpf_prog_insn_size(fp); struct bpf_insn *dst; bool was_ld_map; - u8 *raw, *todo; - __be32 *result; - __be64 *bits; + u32 i; - raw = vmalloc(raw_size); - if (!raw) + dst = vmalloc(size); + if (!dst) return -ENOMEM; - sha1_init(digest); - memset(ws, 0, sizeof(ws)); - /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ - dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -330,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - - psize = bpf_prog_insn_size(fp); - memset(&raw[psize], 0, raw_size - psize); - raw[psize++] = 0x80; - - bsize = round_up(psize, SHA1_BLOCK_SIZE); - blocks = bsize / SHA1_BLOCK_SIZE; - todo = raw; - if (bsize - psize >= sizeof(__be64)) { - bits = (__be64 *)(todo + bsize - sizeof(__be64)); - } else { - bits = (__be64 *)(todo + bsize + bits_offset); - blocks++; - } - *bits = cpu_to_be64((psize - 1) << 3); - - while (blocks--) { - sha1_transform(digest, todo, ws); - todo += SHA1_BLOCK_SIZE; - } - - result = (__force __be32 *)digest; - for (i = 0; i < SHA1_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(digest[i]); - memcpy(fp->tag, result, sizeof(fp->tag)); - - vfree(raw); + sha256((u8 *)dst, size, fp->digest); + vfree(dst); return 0; } @@ -778,13 +750,32 @@ bool is_bpf_text_address(unsigned long addr) struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { - struct bpf_ksym *ksym = bpf_ksym_find(addr); + struct bpf_ksym *ksym; + + WARN_ON_ONCE(!rcu_read_lock_held()); + ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? 
container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } +bool bpf_has_frame_pointer(unsigned long ip) +{ + struct bpf_ksym *ksym; + unsigned long offset; + + guard(rcu)(); + + ksym = bpf_ksym_find(ip); + if (!ksym || !ksym->fp_start || !ksym->fp_end) + return false; + + offset = ip - ksym->start; + + return offset >= ksym->fp_start && offset < ksym->fp_end; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; @@ -1290,6 +1281,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, return 0; } +const char *bpf_jit_get_prog_name(struct bpf_prog *prog) +{ + if (prog->aux->ksym.prog) + return prog->aux->ksym.name; + return prog->aux->name; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, @@ -1468,6 +1466,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) bpf_prog_clone_free(fp_other); } +static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_map *map; + int i; + + if (len <= 1) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + bpf_insn_array_adjust(map, off, len); + } +#endif +} + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; @@ -1523,6 +1538,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) clone = tmp; insn_delta = rewritten - 1; + /* Instructions arrays must be updated using absolute xlated offsets */ + adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; @@ -1706,6 +1724,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL @@ -2102,14 +2121,15 @@ out: #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: - /* Speculation barrier for mitigating Speculative Store Bypass. - * In case of arm64, we rely on the firmware mitigation as - * controlled via the ssbd kernel parameter. Whenever the - * mitigation is enabled, it works for all of the kernel code - * with no need to provide any additional instructions here. - * In case of x86, we use 'lfence' insn for mitigation. We - * reuse preexisting logic from Spectre v1 mitigation that - * happens to produce the required code on x86 for v4 as well. + /* Speculation barrier for mitigating Speculative Store Bypass, + * Bounds-Check Bypass and Type Confusion. In case of arm64, we + * rely on the firmware mitigation as controlled via the ssbd + * kernel parameter. Whenever the mitigation is enabled, it + * works for all of the kernel code with no need to provide any + * additional instructions here. In case of x86, we use 'lfence' + * insn for mitigation. We reuse preexisting logic from Spectre + * v1 mitigation that happens to produce the required code on + * x86 for v4 as well. */ barrier_nospec(); CONT; @@ -2351,47 +2371,59 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! 
+ * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); - bool ret; struct bpf_prog_aux *aux = fp->aux; + enum bpf_cgroup_storage_type i; + bool ret = false; + u64 cookie; if (fp->kprobe_override) - return false; - - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. - */ - if (bpf_prog_is_dev_bound(aux)) - return false; + return ret; - spin_lock(&map->owner.lock); - if (!map->owner.type) { - /* There's no owner yet where we could check for - * compatibility. - */ - map->owner.type = prog_type; - map->owner.jited = fp->jited; - map->owner.xdp_has_frags = aux->xdp_has_frags; - map->owner.attach_func_proto = aux->attach_func_proto; + spin_lock(&map->owner_lock); + /* There's no owner yet where we could check for compatibility. */ + if (!map->owner) { + map->owner = bpf_map_owner_alloc(map); + if (!map->owner) + goto err; + map->owner->type = prog_type; + map->owner->jited = fp->jited; + map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->expected_attach_type = fp->expected_attach_type; + map->owner->attach_func_proto = aux->attach_func_proto; + for_each_cgroup_storage_type(i) { + map->owner->storage_cookie[i] = + aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + } ret = true; } else { - ret = map->owner.type == prog_type && - map->owner.jited == fp->jited && - map->owner.xdp_has_frags == aux->xdp_has_frags; + ret = map->owner->type == prog_type && + map->owner->jited == fp->jited && + map->owner->xdp_has_frags == aux->xdp_has_frags; if (ret && - map->owner.attach_func_proto != aux->attach_func_proto) { + map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + map->owner->expected_attach_type != fp->expected_attach_type) + ret = false; + for_each_cgroup_storage_type(i) { + if (!ret) + break; + cookie = aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + ret = map->owner->storage_cookie[i] == cookie || + !cookie; + } + if (ret && + map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -2404,11 +2436,24 @@ bool bpf_prog_map_compatible(struct bpf_map *map, } } } - spin_unlock(&map->owner.lock); - +err: + spin_unlock(&map->owner_lock); return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. 
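In the __bpf_prog_map_compatible() hunk above, the first program added to a prog map now allocates map->owner under map->owner_lock and records the properties (type, JIT state, expected attach type, storage cookies) that every later program must match. A simplified standalone sketch of that lazily-claimed-owner check, tracking only two of the fields:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the owner record established by the first
 * inserted program; later programs must be compatible with it. */
struct owner {
	int type;
	bool jited;
};

struct map {
	pthread_mutex_t lock;
	struct owner *owner;	/* NULL until the first program is added */
};

static bool map_compatible(struct map *m, int type, bool jited)
{
	bool ret = false;

	pthread_mutex_lock(&m->lock);
	if (!m->owner) {
		/* First user: record the expected properties. */
		m->owner = calloc(1, sizeof(*m->owner));
		if (!m->owner)
			goto out;
		m->owner->type = type;
		m->owner->jited = jited;
		ret = true;
	} else {
		/* Later users: must match what the first user established. */
		ret = m->owner->type == type && m->owner->jited == jited;
	}
out:
	pthread_mutex_unlock(&m->lock);
	return ret;
}

int main(void)
{
	struct map m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("%d\n", map_compatible(&m, 1, true));	/* 1: claims ownership */
	printf("%d\n", map_compatible(&m, 1, true));	/* 1: matches */
	printf("%d\n", map_compatible(&m, 2, true));	/* 0: type mismatch */
	free(m.owner);
	return 0;
}

Allocating the record on first use (rather than embedding it) presumably keeps struct bpf_map smaller for maps that never hold programs.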
+ */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2421,7 +2466,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -2432,8 +2477,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2442,15 +2488,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2478,7 +2525,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -2856,6 +2904,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) @@ -2987,7 +3036,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, @@ -3029,6 +3081,21 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* By default, enable the verifier's mitigations against Spectre v1 and v4 for + * all archs. The value returned must not change at runtime as there is + * currently no support for reloading programs that were loaded without + * mitigations. + */ +bool __weak bpf_jit_bypass_spec_v1(void) +{ + return false; +} + +bool __weak bpf_jit_bypass_spec_v4(void) +{ + return false; +} + /* Return true if the JIT inlines the call to the helper corresponding to * the imm. 
* @@ -3099,8 +3166,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } -int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2) +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { return -ENOTSUPP; } @@ -3139,6 +3207,22 @@ u64 __weak arch_bpf_timed_may_goto(void) return 0; } +static noinline void bpf_prog_report_may_goto_violation(void) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); + bpf_stream_dump_stack(ss); + })); +#endif +} + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); @@ -3149,8 +3233,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ - if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { + bpf_prog_report_may_goto_violation(); return 0; + } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } @@ -3187,3 +3273,84 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); + +#ifdef CONFIG_BPF_SYSCALL + +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, + const char **linep, int *nump) +{ + int idx = -1, insn_start, insn_end, len; + struct bpf_line_info *linfo; + void **jited_linfo; + struct btf *btf; + int nr_linfo; + + btf = prog->aux->btf; + linfo = prog->aux->linfo; + jited_linfo = prog->aux->jited_linfo; + + if (!btf || !linfo || !jited_linfo) + return -EINVAL; + len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; + + linfo = &prog->aux->linfo[prog->aux->linfo_idx]; + jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; + + insn_start = linfo[0].insn_off; + insn_end = insn_start + len; + nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; + + for (int i = 0; i < nr_linfo && + linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { + if (jited_linfo[i] >= (void *)ip) + break; + idx = i; + } + + if (idx == -1) + return -ENOENT; + + /* Get base component of the file path. */ + *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); + *filep = kbasename(*filep); + /* Obtain the source line, and strip whitespace in prefix. */ + *linep = btf_name_by_offset(btf, linfo[idx].line_off); + while (isspace(**linep)) + *linep += 1; + *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); + return 0; +} + +struct walk_stack_ctx { + struct bpf_prog *prog; +}; + +static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct walk_stack_ctx *ctxp = cookie; + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. 
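bpf_check_timed_may_goto() above now reports to the program's BPF_STDERR stream when a may_goto loop exhausts its quarter-second time slice. The standalone sketch below mirrors only the time-slice shape of that check; the budget value is an illustrative stand-in for BPF_MAX_TIMED_LOOPS, the re-check runs every iteration for brevity (the kernel only re-checks when the per-frame count is exhausted), and the stream reporting is reduced to a stderr message.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC	1000000000ULL
#define MAX_TIMED_LOOPS	8192	/* illustrative stand-in */

struct timed_may_goto {
	uint64_t count;
	uint64_t timestamp;	/* start of the current time slice, in ns */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/* Start a slice on first use, refill the loop budget while within the
 * slice, and return 0 (abort the loop) once a quarter second has passed. */
static uint64_t check_timed_may_goto(struct timed_may_goto *p)
{
	uint64_t t = now_ns();

	if (!p->count || !p->timestamp) {
		p->timestamp = t;
		return MAX_TIMED_LOOPS;
	}
	if (t - p->timestamp >= NSEC_PER_SEC / 4) {
		fprintf(stderr, "timeout detected for may_goto\n");
		return 0;
	}
	return MAX_TIMED_LOOPS;
}

int main(void)
{
	struct timed_may_goto state = { 0 };
	unsigned long iterations = 0;

	while ((state.count = check_timed_may_goto(&state)))
		iterations++;	/* body of the bounded loop */

	printf("loop aborted after %lu iterations\n", iterations);
	return 0;
}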
+ */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return true; + /* Make sure we return the main prog if we found a subprog */ + ctxp->prog = prog->aux->main_prog_aux->prog; + return false; +} + +struct bpf_prog *bpf_prog_find_from_stack(void) +{ + struct walk_stack_ctx ctx = {}; + + arch_bpf_stack_walk(find_from_stack_cb, &ctx); + return ctx.prog; +} + +#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 67e8a2fc1a99..703e5df1f4ef 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); @@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) * This is equivalent to how NAPI decides whether to perform a full * flush. */ - gro_flush(&rcpu->gro, !empty && HZ >= 1000); - gro_normal_list(&rcpu->gro); + gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000); } static int cpu_map_kthread_run(void *data) @@ -551,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); - queue_rcu_work(system_wq, &old_rcpu->free_work); + queue_rcu_work(system_percpu_wq, &old_rcpu->free_work); } } diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? 
__bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 482d284a1553..2625601de76e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_NOWAIT | __GFP_NOWARN, + GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 20883c6b1546..f8a3c7eb451e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { + verbose(cbs->private_data, "(%02x) gotox r%d\n", + insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c new file mode 100644 index 000000000000..cd500248abd9 --- /dev/null +++ b/kernel/bpf/dmabuf_iter.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Google LLC */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/dma-buf.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +struct dmabuf_iter_priv { + /* + * If this pointer is non-NULL, the buffer's refcount is elevated to + * prevent destruction between stop/start. If reading is not resumed and + * start is never called again, then dmabuf_iter_seq_fini drops the + * reference when the iterator is released. + */ + struct dma_buf *dmabuf; +}; + +static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct dmabuf_iter_priv *p = seq->private; + + if (*pos) { + struct dma_buf *dmabuf = p->dmabuf; + + if (!dmabuf) + return NULL; + + /* + * Always resume from where we stopped, regardless of the value + * of pos. 
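The disasm hunk above teaches print_bpf_insn() about the indirect-jump encoding BPF_JMP | BPF_JA | BPF_X, printed as "gotox". A tiny standalone check of the resulting code byte, using the uapi opcode macros:

#include <stdio.h>
#include <linux/bpf.h>

int main(void)
{
	/* Class BPF_JMP, op BPF_JA, source BPF_X: the jump target is taken
	 * from dst_reg rather than from a constant offset. */
	unsigned char code = BPF_JMP | BPF_JA | BPF_X;

	printf("gotox opcode: 0x%02x\n", code);
	return 0;
}

It prints 0x0d, i.e. the "(0d) gotox rN" form the new branch emits, with the target register carried in dst_reg.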
+ */ + p->dmabuf = NULL; + return dmabuf; + } + + return dma_buf_iter_begin(); +} + +static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dma_buf *dmabuf = v; + + ++*pos; + + return dma_buf_iter_next(dmabuf); +} + +struct bpf_iter__dmabuf { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct dma_buf *, dmabuf); +}; + +static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter_meta meta = { + .seq = seq, + }; + struct bpf_iter__dmabuf ctx = { + .meta = &meta, + .dmabuf = v, + }; + struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop); + + if (prog) + return bpf_iter_run_prog(prog, &ctx); + + return 0; +} + +static int dmabuf_iter_seq_show(struct seq_file *seq, void *v) +{ + return __dmabuf_seq_show(seq, v, false); +} + +static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct dma_buf *dmabuf = v; + + if (dmabuf) { + struct dmabuf_iter_priv *p = seq->private; + + p->dmabuf = dmabuf; + } +} + +static const struct seq_operations dmabuf_iter_seq_ops = { + .start = dmabuf_iter_seq_start, + .next = dmabuf_iter_seq_next, + .stop = dmabuf_iter_seq_stop, + .show = dmabuf_iter_seq_show, +}; + +static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "dmabuf iter\n"); +} + +static int dmabuf_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv; + + p->dmabuf = NULL; + return 0; +} + +static void dmabuf_iter_seq_fini(void *priv) +{ + struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv; + + if (p->dmabuf) + dma_buf_put(p->dmabuf); +} + +static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { + .seq_ops = &dmabuf_iter_seq_ops, + .init_seq_private = dmabuf_iter_seq_init, + .fini_seq_private = dmabuf_iter_seq_fini, + .seq_priv_size = sizeof(struct dmabuf_iter_priv), +}; + +static struct bpf_iter_reg bpf_dmabuf_reg_info = { + .target = "dmabuf", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_dmabuf_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__dmabuf, dmabuf), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &dmabuf_iter_seq_info, +}; + +DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf) +BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf) + +static int __init dmabuf_iter_init(void) +{ + bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0]; + return bpf_iter_reg_target(&bpf_dmabuf_reg_info); +} + +late_initcall(dmabuf_iter_init); + +struct bpf_iter_dmabuf { + /* + * opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __aligned(8); + +/* Non-opaque version of bpf_iter_dmabuf */ +struct bpf_iter_dmabuf_kern { + struct dma_buf *dmabuf; +} __aligned(8); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->dmabuf = NULL; + return 0; +} + +__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + kit->dmabuf = dma_buf_iter_next(kit->dmabuf); + else + kit->dmabuf = dma_buf_iter_begin(); + + return kit->dmabuf; +} + +__bpf_kfunc void bpf_iter_dmabuf_destroy(struct 
bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + dma_buf_put(kit->dmabuf); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 92b606d60020..c8a9b27f8663 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + roundup(key_size, 8)); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -196,12 +206,16 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -213,12 +227,8 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -245,7 +255,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -453,8 +464,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. 
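Most of the hashtab churn that follows replaces open-coded `l->key + round_up(key_size, 8)` with the new htab_elem_value() helper. A standalone sketch of that element layout; the hash field here stands in for the real bookkeeping in struct htab_elem:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same rounding the helper centralizes: the value area starts at the key
 * rounded up to the next multiple of 8 bytes. */
static size_t round_up8(size_t x)
{
	return (x + 7) & ~(size_t)7;
}

struct elem {
	uint32_t hash;	/* stand-in for the element's bookkeeping fields */
	char key[];	/* key bytes, followed by the 8-byte-aligned value */
};

static void *elem_value(struct elem *l, uint32_t key_size)
{
	return l->key + round_up8(key_size);
}

int main(void)
{
	uint32_t key_size = 13;
	struct elem *l = malloc(sizeof(*l) + round_up8(key_size) + 8);

	if (!l)
		return 1;
	printf("value starts %td bytes past the key\n",
	       (char *)elem_value(l, key_size) - l->key);	/* 16 */
	free(l);
	return 0;
}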
percpu_lru has @@ -549,10 +558,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -651,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -670,7 +675,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -709,7 +714,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -763,7 +768,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -929,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { + void *ptr; + if (!onallcpus) { /* copy true value_size bytes */ - copy_map_value(&htab->map, this_cpu_ptr(pptr), value); + ptr = this_cpu_ptr(pptr); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value_long(&htab->map, ptr, value + off); + bpf_obj_free_fields(htab->map.record, ptr); off += size; } } @@ -968,8 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1039,11 +1049,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1072,10 +1080,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1084,8 
+1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1106,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1134,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1156,24 +1162,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. */ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } } htab_unlock_bucket(b, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); - } + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: htab_unlock_bucket(b, flags); @@ -1202,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1220,8 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); ret = htab_lock_bucket(b, &flags); if (ret) @@ -1255,13 +1249,14 @@ err_lock_bucket: return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1271,8 +1266,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1292,21 +1286,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + /* 
Update value in-place */ + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -1326,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1383,7 +1384,7 @@ err_lock_bucket: static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -1404,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1440,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1486,7 +1485,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1497,30 +1496,27 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We only free timer on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + /* We only free internal structs on uref dropping to zero */ + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); - else - htab_free_prealloced_timers_and_wq(htab); - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + if (htab_is_prealloc(htab)) + htab_free_prealloced_internal_structs(htab); + else + htab_free_malloced_internal_structs(htab); } /* Called when 
map->refcnt goes to zero, either from workqueue or from syscall */ @@ -1615,15 +1611,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1680,12 +1673,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1720,7 +1713,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1812,8 +1804,8 @@ again_nocopy: off += size; } } else { - value = l->key + roundup_key_size; - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + value = htab_elem_value(l, key_size); + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ @@ -2063,11 +2055,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2077,10 +2069,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2165,7 +2156,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2180,7 +2170,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). 
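bpf_for_each_hash_elem() above is the hash-map backend of the bpf_for_each_map_elem() helper. For context, a minimal BPF-side caller might look like the sketch below; it assumes a libbpf build environment (clang -target bpf), and the map, section and callback names are made up for illustration.

/* build: clang -O2 -g -target bpf -c for_each_sketch.bpf.c -o for_each_sketch.bpf.o */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map;

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} counts SEC(".maps");

/* Called once per element; returning 0 continues the walk, 1 stops it. */
static long sum_one(struct bpf_map *map, __u32 *key, __u64 *val, __u64 *sum)
{
	*sum += *val;
	return 0;
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int sum_counts(void *ctx)
{
	__u64 sum = 0;

	bpf_for_each_map_elem(&counts, sum_one, &sum, 0);
	bpf_printk("sum=%llu", sum);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";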
@@ -2196,7 +2185,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, @@ -2258,7 +2247,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2279,7 +2268,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, @@ -2411,8 +2400,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; @@ -2536,24 +2525,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). 
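bpf_fd_htab_map_update_elem() above now routes through htab_map_update_elem_in_place(), which swaps the stored inner-map pointer under the bucket lock and calls map_fd_put_ptr() on the old pointer only after unlocking. A standalone sketch of that swap-under-lock, release-after-unlock ordering:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct inner {
	int refcnt;
	char name[16];
};

static void inner_put(struct inner *p)
{
	if (--p->refcnt == 0) {
		printf("freeing %s\n", p->name);
		free(p);
	}
}

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static struct inner *slot;

static void update_slot(struct inner *newp)
{
	struct inner *old;

	pthread_mutex_lock(&bucket_lock);
	old = slot;
	slot = newp;		/* in-place pointer replacement */
	pthread_mutex_unlock(&bucket_lock);

	if (old)
		inner_put(old);	/* drop the old reference outside the lock */
}

int main(void)
{
	struct inner *a = calloc(1, sizeof(*a));
	struct inner *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->refcnt = 1; strcpy(a->name, "inner-a");
	b->refcnt = 1; strcpy(b->name, "inner-b");

	update_slot(a);
	update_slot(b);		/* prints "freeing inner-a" */
	inner_put(slot);	/* prints "freeing inner-b" */
	return 0;
}

Deferring the put keeps potentially heavyweight release work out of the bucket-lock critical section.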
*/ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e3a2662f4e33..db72b96f9c8c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -23,6 +23,12 @@ #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> +#include <linux/bpf_verifier.h> +#include <linux/uaccess.h> +#include <linux/verification.h> +#include <linux/task_work.h> +#include <linux/irq_work.h> +#include <linux/buildid.h> #include "../../lib/kstrtox.h" @@ -37,8 +43,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -54,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -72,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } @@ -129,7 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -761,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. 
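Several hunks in this file and in hashtab.c collapse the repeated `rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()` warning into a single bpf_rcu_lock_held() check. Assuming the predicate simply wraps the same three flavours the hunks delete, its shape is roughly the following; the user-space stand-ins exist only so the sketch compiles and runs.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the three RCU flavour checks; in the kernel these are
 * rcu_read_lock_held(), rcu_read_lock_trace_held() and
 * rcu_read_lock_bh_held(). */
static bool normal_held, trace_held, bh_held;

/* Presumed shape of the consolidated predicate: true when any RCU flavour
 * a BPF program may run under is held. */
static bool bpf_rcu_lock_held(void)
{
	return normal_held || trace_held || bh_held;
}

int main(void)
{
	trace_held = true;	/* e.g. a sleepable program under rcu_read_lock_trace() */
	printf("lock held: %d\n", bpf_rcu_lock_held());
	return 0;
}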
- */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; @@ -792,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); +} + /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * @@ -816,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -832,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { @@ -882,6 +881,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1])) { + if (tmp_buf) + cur_arg = raw_args[num_spec]; + goto nocopy_fmt; + } + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; @@ -889,11 +895,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, goto fmt_str; } - if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || - ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { - /* just kernel pointers */ if (tmp_buf) cur_arg = raw_args[num_spec]; i++; @@ -1080,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) +{ + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + *arr_idx = ((char *)value - array->value) / array->elem_size; + return arr_idx; + } + return (void *)value - round_up(map->key_size, 8); +} + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; @@ -1162,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) * bpf_map_delete_elem() on the same timer. 
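map_key_from_value() above centralizes how the timer and workqueue callbacks recover the map key from a value pointer: for array maps the key (the index) is recomputed from the value's offset inside the flat value area, while for hash maps the key bytes sit round_up(key_size, 8) bytes in front of the value. A standalone illustration of both calculations:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Array-map case: values are laid out back to back. */
	char values[8 * 16];			/* 16 elements, elem_size 8 */
	uint32_t elem_size = 8;
	char *value = &values[8 * 5];		/* pointer into element 5 */
	uint32_t idx = (uint32_t)((value - values) / elem_size);

	printf("array key (index): %u\n", idx);		/* 5 */

	/* Hash-map case: the value sits round_up(key_size, 8) past the key. */
	uint32_t key_size = 13;
	uint32_t value_off = (key_size + 7) & ~7u;	/* 16 */

	printf("hash key starts %u bytes before the value\n", value_off);
	return 0;
}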
*/ this_cpu_write(hrtimer_running, t); - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ @@ -1196,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work) if (!callback_fn) return; - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); @@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work) rcu_read_unlock_trace(); } +static void bpf_async_cb_rcu_free(struct rcu_head *rcu) +{ + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + + kfree_nolock(cb); +} + static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); - kfree_rcu(w, cb.rcu); + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) @@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work) /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call - * kfree_rcu(t) right after for both preallocated and non-preallocated + * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, @@ -1270,8 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1312,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); - kfree(cb); + kfree_nolock(cb); ret = -EPERM; } out: @@ -1577,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val) * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do - * kfree_rcu, even though no more timers can be armed. + * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a @@ -1593,7 +1599,7 @@ void bpf_timer_cancel_and_free(void *val) * timer callback. 
*/ if (this_cpu_read(hrtimer_running)) { - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); return; } @@ -1604,9 +1610,9 @@ void bpf_timer_cancel_and_free(void *val) * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } @@ -1654,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +struct bpf_dynptr_file_impl { + struct freader freader; + /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ + u64 offset; + u64 size; +}; + /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ @@ -1682,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + return df->size; + } + return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) +{ + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->offset += off; + return; + } + ptr->offset += off; +} + +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->size = new_size; + return; + } + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } +static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) +{ + const void *ptr; + + if (!buf) + return -EINVAL; + + df->freader.buf = buf; + df->freader.buf_sz = len; + ptr = freader_fetch(&df->freader, offset + df->offset, len); + if (!ptr) + return df->freader.err; + + if (ptr != buf) /* Force copying into the buffer */ + memcpy(buf, ptr, len); + + return 0; +} + void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { @@ -1713,17 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) -{ - u32 size = __bpf_dynptr_size(ptr); - - if (len > size || offset > size - len) - return -E2BIG; - - return 0; -} - -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1758,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1786,14 +1831,19 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; + case BPF_DYNPTR_TYPE_FILE: + return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1809,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1842,14 +1892,17 @@ static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, voi if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, + len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1865,7 +1918,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = 
ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -1888,6 +1941,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -1912,6 +1966,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; +const struct bpf_func_proto bpf_perf_event_read_proto __weak; +const struct bpf_func_proto bpf_send_signal_proto __weak; +const struct bpf_func_proto bpf_send_signal_thread_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_proto __weak; +const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1965,6 +2025,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: break; } @@ -2022,7 +2084,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; default: break; } @@ -2037,6 +2113,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: @@ -2047,6 +2125,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? 
NULL : &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_copy_from_user: + return &bpf_copy_from_user_proto; + case BPF_FUNC_copy_from_user_task: + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: @@ -2057,6 +2139,19 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; + case BPF_FUNC_get_task_stack: + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; + case BPF_FUNC_get_branch_snapshot: + return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; default: return NULL; } @@ -2293,6 +2388,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->next; +} + +__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->prev; +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2366,6 +2481,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) return (struct bpf_rb_node *)rb_first_cached(r); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)r->rb_root.rb_node; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_left; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_right; +} + /** * bpf_task_acquire - Acquire a reference to a task. 
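 *
 * The bpf_list_front()/bpf_list_back() and bpf_rbtree_root()/left()/right()
 * kfuncs above return non-owning references (KF_RET_NULL, no KF_ACQUIRE), so
 * a program can walk a tree while holding its bpf_spin_lock. A lookup sketch,
 * assuming a hypothetical node layout and the container_of()/can_loop
 * conveniences from the selftests' bpf_experimental.h:
 *
 *        struct node_data { long key; struct bpf_rb_node r; };
 *
 *        bpf_spin_lock(&glock);
 *        n = bpf_rbtree_root(&groot);
 *        while (can_loop && n) {
 *                node = container_of(n, struct node_data, r);
 *                if (node->key == key)
 *                        break;
 *                n = node->key < key ? bpf_rbtree_right(&groot, n)
 *                                    : bpf_rbtree_left(&groot, n);
 *        }
 *        bpf_spin_unlock(&glock);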
A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -2455,7 +2597,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; @@ -2591,12 +2733,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2628,6 +2770,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); + case BPF_DYNPTR_TYPE_FILE: + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); + return err ? NULL : buffer__opt; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2676,8 +2823,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2709,10 +2856,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2722,7 +2869,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end if (start > size || end > size) return -ERANGE; - ptr->offset += start; + bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; @@ -2745,7 +2892,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2782,14 +2929,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. 
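 *
 * A BPF-side sketch of the bpf_dynptr_slice() contract documented above
 * (hypothetical program; the kfunc declaration normally comes from the
 * selftests' bpf_kfuncs.h or an extern __ksym declaration):
 *
 *        char stack_buf[16];
 *        const void *p;
 *
 *        p = bpf_dynptr_slice(dptr, 0, stack_buf, sizeof(stack_buf));
 *        if (!p)
 *                return 0;       // out of range or unreadable
 *        // p points either directly into the dynptr's data or at stack_buf,
 *        // which then holds a copy of the requested bytes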
*/ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2811,7 +2958,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2826,6 +2973,52 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return 0; } +/** + * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. + * @p: Destination dynptr - where data will be filled + * @offset: Offset into the dynptr to start filling from + * @size: Number of bytes to fill + * @val: Constant byte to fill the memory with + * + * Fills the @size bytes of the memory area pointed to by @p + * at @offset with the constant byte @val. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + u64 chunk_sz, write_off; + char buf[256]; + void* slice; + int err; + + slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); + if (likely(slice)) { + memset(slice, val, size); + return 0; + } + + if (__bpf_dynptr_is_rdonly(ptr)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(ptr, offset, size); + if (err) + return err; + + /* Non-linear data under the dynptr, write from a local buffer */ + chunk_sz = min_t(u64, sizeof(buf), size); + memset(buf, val, chunk_sz); + + for (write_off = 0; write_off < size; write_off += chunk_sz) { + chunk_sz = min_t(u64, sizeof(buf), size - write_off); + err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); + if (err) + return err; + } + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2858,9 +3051,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; - if (!is_bpf_text_address(ip)) - return !ctx->cnt; + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. 
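 *
 * The bpf_dynptr_memset() kfunc added above follows the same pattern as
 * bpf_dynptr_copy(): try a writable slice first, otherwise fall back to
 * chunked writes through a 256-byte stack buffer. A minimal call from a
 * hypothetical BPF program:
 *
 *        // zero the first 64 bytes of a writable dynptr
 *        if (bpf_dynptr_memset(&dptr, 0, 64, 0))
 *                return 0;       // -EINVAL for read-only dynptrs, -E2BIG if out of range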
+ */ + rcu_read_lock(); prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; @@ -2923,9 +3123,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, - void *aux__ign) + void *aux__prog) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -3194,8 +3394,1003 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) local_irq_restore(*flags__irq_flag); } +__bpf_kfunc void __bpf_trap(void) +{ +} + +/* + * Kfuncs for string operations. + * + * Since strings are not necessarily %NUL-terminated, we cannot directly call + * in-kernel implementations. Instead, we open-code the implementations using + * __get_kernel_nofault instead of plain dereference to make them safe. + */ + +static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) +{ + char c1, c2; + int i; + + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c1, s1, char, err_out); + __get_kernel_nofault(&c2, s2, char, err_out); + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (c1 == '\0') + return 0; + s1++; + s2++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, false); +} + +/** + * bpf_strcasecmp - Compare two strings, ignoring the case of the characters + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, true); +} + +/** + * bpf_strnchr - Find a character in a length limited string + * @s__ign: The string to be searched + * @count: The number of characters to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. 
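 *
 * Worked example of the %NUL rule (values follow from the loop below):
 *
 *        bpf_strnchr("abc", 4, '\0') == 3        // terminator found at index 3
 *        bpf_strnchr("abc", 3, '\0') == -ENOENT  // only three characters searched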
+ * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in the first @count characters of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + return i; + if (sc == '\0') + return -ENOENT; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; +err_out: + return -EFAULT; +} + +/** + * bpf_strchr - Find the first occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + * + * Return: + * * >=0 - The index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchr(const char *s__ign, char c) +{ + return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); +} + +/** + * bpf_strchrnul - Find and return a character in a string, or end of string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign or index of + * the null byte at the end of @s__ign when @c is not found + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == '\0' || sc == c) + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strrchr - Find the last occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the last occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strrchr(const char *s__ign, int c) +{ + char sc; + int i, last = -ENOENT; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + last = i; + if (sc == '\0') + return last; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnlen - Calculate the length of a length-limited string + * @s__ign: The string + * @count: The maximum number of characters to count + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) +{ + char c; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i 
< count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c, s__ign, char, err_out); + if (c == '\0') + return i; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : i; +err_out: + return -EFAULT; +} + +/** + * bpf_strlen - Calculate the length of a string + * @s__ign: The string + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strlen(const char *s__ign) +{ + return bpf_strnlen(s__ign, XATTR_SIZE_MAX); +} + +/** + * bpf_strspn - Calculate the length of the initial substring of @s__ign which + * only contains letters in @accept__ign + * @s__ign: The string to be searched + * @accept__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which only + * contains letters from @accept__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) +{ + char cs, ca; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(accept__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&ca, accept__ign + j, char, err_out); + if (cs == ca || ca == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (ca == '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strcspn - Calculate the length of the initial substring of @s__ign which + * does not contain letters in @reject__ign + * @s__ign: The string to be searched + * @reject__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which does not + * contain letters from @reject__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) +{ + char cs, cr; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(reject__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&cr, reject__ign + j, char, err_out); + if (cs == cr || cr == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (cr != '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +static int __bpf_strnstr(const char *s1, const char *s2, size_t len, + bool ignore_case) +{ + char c1, c2; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&c2, s2 + j, char, err_out); + if (c2 == '\0') + return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. 
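 *
 * Worked example: __bpf_strnstr("foobar", "bar", 6, false) returns 3. At
 * i == 3, j == 3 the loop reads s2[3] (the terminator) while i + j == len,
 * which is exactly the extra-byte case described above; without it, a
 * match ending at the len boundary would be missed.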
+ */ + if (i + j == len) + break; + __get_kernel_nofault(&c1, s1 + j, char, err_out); + + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + + if (c1 == '\0') + return -ENOENT; + if (c1 != c2) + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (i + j == len) + return -ENOENT; + s1++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strstr - Find the first substring in a string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); +} + +/** + * bpf_strcasestr - Find the first substring in a string, ignoring the case of + * the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, false); +} + +/** + * bpf_strncasestr - Find the first substring in a length-limited string, + * ignoring the case of the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, true); +} + +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. 
+ * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. 
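 *
 * A sketch of how these keyring kfuncs compose from a sleepable BPF
 * program (hypothetical; the dynptrs ddata and dsig are assumed to already
 * wrap the signed data and its signature):
 *
 *        struct bpf_key *tk;
 *        int err;
 *
 *        tk = bpf_lookup_system_key(VERIFY_USE_SECONDARY_KEYRING);
 *        if (!tk)
 *                return -ENOENT;
 *        err = bpf_verify_pkcs7_signature(&ddata, &dsig, tk);
 *        bpf_key_put(tk);
 *        return err;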
+ */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_BPF_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ + +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +enum bpf_task_work_state { + /* bpf_task_work is ready to be used */ + BPF_TW_STANDBY = 0, + /* irq work scheduling in progress */ + BPF_TW_PENDING, + /* task work scheduling in progress */ + BPF_TW_SCHEDULING, + /* task work is scheduled successfully */ + BPF_TW_SCHEDULED, + /* callback is running */ + BPF_TW_RUNNING, + /* associated BPF map value is deleted */ + BPF_TW_FREED, +}; + +struct bpf_task_work_ctx { + enum bpf_task_work_state state; + refcount_t refcnt; + struct callback_head work; + struct irq_work irq_work; + /* bpf_prog that schedules task work */ + struct bpf_prog *prog; + /* task for which callback is scheduled */ + struct task_struct *task; + /* the map and map value associated with this context */ + struct bpf_map *map; + void *map_val; + enum task_work_notify_mode mode; + bpf_task_work_callback_t callback_fn; + struct rcu_head rcu; +} __aligned(8); + +/* Actual type for struct bpf_task_work */ +struct bpf_task_work_kern { + struct bpf_task_work_ctx *ctx; +}; + +static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) +{ + if (ctx->prog) { + bpf_prog_put(ctx->prog); + ctx->prog = NULL; + } + if (ctx->task) { + bpf_task_release(ctx->task); + ctx->task = NULL; + } +} + +static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) +{ + return refcount_inc_not_zero(&ctx->refcnt); +} + +static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) +{ + if (!refcount_dec_and_test(&ctx->refcnt)) + return; + + bpf_task_work_ctx_reset(ctx); + + /* bpf_mem_free expects migration to be disabled */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, ctx); + migrate_enable(); +} + +static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) +{ + /* + * Scheduled task_work callback holds ctx ref, so if we successfully + * cancelled, we put that ref on callback's behalf. If we couldn't + * cancel, callback will inevitably run or has already completed + * running, and it would have taken care of its ctx ref itself. 
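 *
 * For reference, the cmpxchg()/xchg() calls in this file implement the
 * following lifecycle (FREED is entered from any state via
 * bpf_task_work_cancel_and_free()):
 *
 *        STANDBY    -> PENDING     schedule kfunc, irq_work queued
 *        PENDING    -> SCHEDULING  bpf_task_work_irq()
 *        SCHEDULING -> SCHEDULED   task_work_add() succeeded
 *        SCHEDULING -> STANDBY     task_work_add() failed
 *        SCHEDULING|SCHEDULED -> RUNNING   bpf_task_work_callback()
 *        RUNNING    -> STANDBY     callback finished, context reusable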
+ */ + if (task_work_cancel(ctx->task, &ctx->work)) + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_callback(struct callback_head *cb) +{ + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); + enum bpf_task_work_state state; + u32 idx; + void *key; + + /* Read lock is needed to protect ctx and map key/value access */ + guard(rcu_tasks_trace)(); + /* + * This callback may start running before bpf_task_work_irq() switched to + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); + if (state == BPF_TW_SCHEDULED) + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); + if (state == BPF_TW_FREED) { + bpf_task_work_ctx_put(ctx); + return; + } + + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); + + migrate_disable(); + ctx->callback_fn(ctx->map, key, ctx->map_val); + migrate_enable(); + + bpf_task_work_ctx_reset(ctx); + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); + + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_irq(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + enum bpf_task_work_state state; + int err; + + guard(rcu_tasks_trace)(); + + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { + bpf_task_work_ctx_put(ctx); + return; + } + + err = task_work_add(ctx->task, &ctx->work, ctx->mode); + if (err) { + bpf_task_work_ctx_reset(ctx); + /* + * try to switch back to STANDBY for another task_work reuse, but we might have + * gone to FREED already, which is fine as we already cleaned up after ourselves + */ + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); + bpf_task_work_ctx_put(ctx); + return; + } + + /* + * It's technically possible for just scheduled task_work callback to + * complete running by now, going SCHEDULING -> RUNNING and then + * dropping its ctx refcount. Instead of capturing extra ref just to + * protected below ctx->state access, we rely on RCU protection to + * perform below SCHEDULING -> SCHEDULED attempt. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); + if (state == BPF_TW_FREED) + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ +} + +static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_kern *twk = (void *)tw; + struct bpf_task_work_ctx *ctx, *old_ctx; + + ctx = READ_ONCE(twk->ctx); + if (ctx) + return ctx; + + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + refcount_set(&ctx->refcnt, 1); /* map's own ref */ + ctx->state = BPF_TW_STANDBY; + + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); + if (old_ctx) { + /* + * tw->ctx is set by concurrent BPF program, release allocated + * memory and try to reuse already set context. 
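 *
 * Reference counting summary: the map value's bpf_task_work_kern owns one
 * reference (set in bpf_task_work_fetch_ctx()), and every successfully
 * scheduled callback holds one more (taken in bpf_task_work_acquire_ctx()).
 * When the last reference drops, bpf_task_work_ctx_put() releases the prog
 * and task references and frees the context through bpf_global_ma.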
+ */ + bpf_mem_free(&bpf_global_ma, ctx); + return old_ctx; + } + + return ctx; /* Success */ +} + +static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_ctx *ctx; + + ctx = bpf_task_work_fetch_ctx(tw, map); + if (IS_ERR(ctx)) + return ctx; + + /* try to get ref for task_work callback to hold */ + if (!bpf_task_work_ctx_tryget(ctx)) + return ERR_PTR(-EBUSY); + + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ + bpf_task_work_ctx_put(ctx); + return ERR_PTR(-EBUSY); + } + + /* + * If no process or bpffs is holding a reference to the map, no new callbacks should be + * scheduled. This does not address any race or correctness issue, but rather is a policy + * choice: dropping user references should stop everything. + */ + if (!atomic64_read(&map->usercnt)) { + /* drop ref we just got for task_work callback itself */ + bpf_task_work_ctx_put(ctx); + /* transfer map's ref into cancel_and_free() */ + bpf_task_work_cancel_and_free(tw); + return ERR_PTR(-EBUSY); + } + + return ctx; +} + +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, + struct bpf_map *map, bpf_task_work_callback_t callback_fn, + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) +{ + struct bpf_prog *prog; + struct bpf_task_work_ctx *ctx; + int err; + + BTF_TYPE_EMIT(struct bpf_task_work); + + prog = bpf_prog_inc_not_zero(aux->prog); + if (IS_ERR(prog)) + return -EBADF; + task = bpf_task_acquire(task); + if (!task) { + err = -EBADF; + goto release_prog; + } + + ctx = bpf_task_work_acquire_ctx(tw, map); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto release_all; + } + + ctx->task = task; + ctx->callback_fn = callback_fn; + ctx->prog = prog; + ctx->mode = mode; + ctx->map = map; + ctx->map_val = (void *)tw - map->record->task_work_off; + init_task_work(&ctx->work, bpf_task_work_callback); + init_irq_work(&ctx->irq_work, bpf_task_work_irq); + + irq_work_queue(&ctx->irq_work); + return 0; + +release_all: + bpf_task_release(task); +release_prog: + bpf_prog_put(prog); + return err; +} + +/** + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); +} + +/** + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int 
bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); +} + +static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, + struct bpf_dynptr_kern *ptr) +{ + struct bpf_dynptr_file_impl *state; + + /* flags is currently unsupported */ + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); + if (!state) { + bpf_dynptr_set_null(ptr); + return -ENOMEM; + } + state->offset = 0; + state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ + freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); + bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); + bpf_dynptr_set_rdonly(ptr); + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); +} + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); +} + +__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; + struct bpf_dynptr_file_impl *df = ptr->data; + + if (!df) + return 0; + + freader_cleanup(&df->freader); + bpf_mem_free(&bpf_global_ma, df); + bpf_dynptr_set_null(ptr); + return 0; +} + __bpf_kfunc_end_defs(); +static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ +} + +void bpf_task_work_cancel_and_free(void *val) +{ + struct bpf_task_work_kern *twk = val; + struct bpf_task_work_ctx *ctx; + enum bpf_task_work_state state; + + ctx = xchg(&twk->ctx, NULL); + if (!ctx) + return; + + state = xchg(&ctx->state, BPF_TW_FREED); + if (state == BPF_TW_SCHEDULED) { + /* run in irq_work to avoid locks in NMI */ + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); + irq_work_queue(&ctx->irq_work); + return; + } + + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) @@ -3209,11 +4404,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) @@ -3229,6 +4429,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef 
CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3275,6 +4483,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) +BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3294,6 +4503,44 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) +#ifdef CONFIG_BPF_EVENTS +BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) +BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +#endif +#ifdef CONFIG_DMA_SHARED_BUFFER +BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) +#endif +BTF_ID_FLAGS(func, __bpf_trap) +BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strcasecmp); +BTF_ID_FLAGS(func, bpf_strchr); +BTF_ID_FLAGS(func, bpf_strchrnul); +BTF_ID_FLAGS(func, bpf_strnchr); +BTF_ID_FLAGS(func, bpf_strrchr); +BTF_ID_FLAGS(func, bpf_strlen); +BTF_ID_FLAGS(func, bpf_strnlen); +BTF_ID_FLAGS(func, bpf_strspn); +BTF_ID_FLAGS(func, bpf_strcspn); +BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strcasestr); +BTF_ID_FLAGS(func, bpf_strnstr); +BTF_ID_FLAGS(func, bpf_strncasestr); +#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) +BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) +#endif +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -3334,7 +4581,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -3345,9 +4592,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. 
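 *
 * A sketch of the file-backed dynptr kfuncs added above, as used from a
 * hypothetical BPF program that already holds a trusted struct file
 * pointer (buf is a stack buffer):
 *
 *        struct bpf_dynptr fdptr;
 *        char buf[64];
 *
 *        if (bpf_dynptr_from_file(file, 0, &fdptr))
 *                return 0;
 *        // reads go through the freader backend; the dynptr is read-only
 *        if (bpf_dynptr_read(buf, sizeof(buf), &fdptr, 0, 0) == 0) {
 *                // buf now holds the first 64 bytes of the file
 *        }
 *        bpf_dynptr_file_discard(&fdptr);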
*/ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } + +void bpf_map_free_internal_structs(struct bpf_map *map, void *val) +{ + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, val); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, val); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, val); +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dc3aa91a6ba0..9f866a010dad 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -144,8 +144,7 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, struct inode *dir) { - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -420,16 +419,12 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, struct dentry *dentry; int ret; - inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) { - inode_unlock(parent->d_inode); + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) return PTR_ERR(dentry); - } ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, &bpf_iter_fops); - dput(dentry); - inode_unlock(parent->d_inode); + simple_done_creating(dentry); return ret; } @@ -442,7 +437,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +466,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } @@ -775,7 +770,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_free_inode(struct inode *inode) +static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; @@ -788,9 +783,9 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, - .free_inode = bpf_free_inode, + .destroy_inode = bpf_destroy_inode, }; enum { @@ -1080,7 +1075,7 @@ static void bpf_kill_super(struct super_block *sb) { struct bpf_mount_opts *opts = sb->s_fs_info; - kill_litter_super(sb); + kill_anon_super(sb); kfree(opts); } diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c index fec8005a121c..8158e9c1af7b 100644 --- a/kernel/bpf/link_iter.c +++ b/kernel/bpf/link_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_link_seq_ops = { .show = bpf_link_seq_show, }; -BTF_ID_LIST(btf_bpf_link_id) -BTF_ID(struct, bpf_link) +BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link) static const struct bpf_iter_seq_info bpf_link_seq_info = { .seq_ops = &bpf_link_seq_ops, diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c new file mode 100644 index 000000000000..60db5d655495 --- /dev/null +++ b/kernel/bpf/liveness.c @@ -0,0 +1,753 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, 
Inc. and affiliates. */ + +#include <linux/bpf_verifier.h> +#include <linux/hashtable.h> +#include <linux/jhash.h> +#include <linux/slab.h> + +/* + * This file implements live stack slots analysis. After accumulating + * stack usage data, the analysis answers queries about whether a + * particular stack slot may be read by an instruction or any of its + * successors. This data is consumed by the verifier states caching + * mechanism to decide which stack slots are important when looking for a + * visited state corresponding to the current state. + * + * The analysis is call chain sensitive, meaning that data is collected + * and queried for tuples (call chain, subprogram instruction index). + * Such sensitivity allows identifying if some subprogram call always + * leads to writes in the caller's stack. + * + * The basic idea is as follows: + * - As the verifier accumulates a set of visited states, the analysis instance + * accumulates a conservative estimate of stack slots that can be read + * or must be written for each visited tuple (call chain, instruction index). + * - If several states happen to visit the same instruction with the same + * call chain, stack usage information for the corresponding tuple is joined: + * - "may_read" set represents a union of all possibly read slots + * (any slot in "may_read" set might be read at or after the instruction); + * - "must_write" set represents an intersection of all possibly written slots + * (any slot in "must_write" set is guaranteed to be written by the instruction). + * - The analysis is split into two phases: + * - read and write marks accumulation; + * - read and write marks propagation. + * - The propagation phase is a textbook live variable data flow analysis: + * + * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] + * state[cc, i].live_before = + * (state[cc, i].live_after / state[cc, i].must_write) U state[cc, i].may_read + * + * Where: + * - `U` stands for set union; + * - `/` stands for set difference; + * - `cc` stands for a call chain; + * - `i` and `s` are instruction indexes. + * + * The above equations are computed for each call chain and instruction + * index until state stops changing. + * - Additionally, in order to transfer "must_write" information from a + * subprogram to call instructions invoking this subprogram, + * the "must_write_acc" set is tracked for each (cc, i) tuple: + * a set of stack slots that are guaranteed to be written by this + * instruction or any of its successors (within the subprogram). + * The equation for "must_write_acc" propagation looks as follows: + * + * state[cc, i].must_write_acc = + * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] + * U state[cc, i].must_write + * + * (An intersection of all "must_write_acc" for instruction successors + * plus all "must_write" slots for the instruction itself). + * - After the propagation phase completes for a subprogram, information from + * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain: + * - "must_write_acc" set is intersected with the call site's "must_write" set; + * - "may_read" set is added to the call site's "may_read" set. + * - Any live stack queries must be taken after the propagation phase. 
+ * - Accumulation and propagation phases can be entered multiple times, + * at any point in time: + * - "may_read" set only grows; + * - "must_write" set only shrinks; + * - for each visited verifier state with zero branches, all relevant + * read and write marks are already recorded by the analysis instance. + * + * Technically, the analysis is facilitated by the following data structures: + * - Call chain: for given verifier state, the call chain is a tuple of call + * instruction indexes leading to the current subprogram plus the subprogram + * entry point index. + * - Function instance: for a given call chain, for each instruction in + * the current subprogram, a mapping between instruction index and a + * set of "may_read", "must_write" and other marks accumulated for this + * instruction. + * - A hash table mapping call chains to function instances. + */ + +struct callchain { + u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ + /* cached subprog_info[*].start for functions owning the frames: + * - sp_starts[curframe] used to get insn relative index within current function; + * - sp_starts[0..current-1] used for fast callchain_frame_up(). + */ + u32 sp_starts[MAX_CALL_FRAMES]; + u32 curframe; /* depth of callsites and sp_starts arrays */ +}; + +struct per_frame_masks { + u64 may_read; /* stack slots that may be read by this instruction */ + u64 must_write; /* stack slots written by this instruction */ + u64 must_write_acc; /* stack slots written by this instruction and its successors */ + u64 live_before; /* stack slots that may be read by this insn and its successors */ +}; + +/* + * A function instance created for a specific callchain. + * Encapsulates read and write marks for each instruction in the function. + * Marks are tracked for each frame in the callchain. + */ +struct func_instance { + struct hlist_node hl_node; + struct callchain callchain; + u32 insn_cnt; /* cached number of insns in the function */ + bool updated; + bool must_write_dropped; + /* Per frame, per instruction masks, frames allocated lazily. */ + struct per_frame_masks *frames[MAX_CALL_FRAMES]; + /* For each instruction a flag telling if "must_write" had been initialized for it. */ + bool *must_write_set; +}; + +struct live_stack_query { + struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 curframe; + u32 insn_idx; +}; + +struct bpf_liveness { + DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ + /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ + struct func_instance *cur_instance; + /* + * Below fields are used to accumulate stack write marks for instruction at + * @write_insn_idx before submitting the marks to @cur_instance. 
+ */ + u64 write_masks_acc[MAX_CALL_FRAMES]; + u32 write_insn_idx; +}; + +/* Compute callchain corresponding to state @st at depth @frameno */ +static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, + struct callchain *callchain, u32 frameno) +{ + struct bpf_subprog_info *subprog_info = env->subprog_info; + u32 i; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= frameno; i++) { + callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; + if (i < st->curframe) + callchain->callsites[i] = st->frame[i + 1]->callsite; + } + callchain->curframe = frameno; + callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; +} + +static u32 hash_callchain(struct callchain *callchain) +{ + return jhash2(callchain->callsites, callchain->curframe, 0); +} + +static bool same_callsites(struct callchain *a, struct callchain *b) +{ + int i; + + if (a->curframe != b->curframe) + return false; + for (i = a->curframe; i >= 0; i--) + if (a->callsites[i] != b->callsites[i]) + return false; + return true; +} + +/* + * Find existing or allocate new function instance corresponding to @callchain. + * Instances are accumulated in env->liveness->func_instances and persist + * until the end of the verification process. + */ +static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, + struct callchain *callchain) +{ + struct bpf_liveness *liveness = env->liveness; + struct bpf_subprog_info *subprog; + struct func_instance *result; + u32 subprog_sz, size, key; + + key = hash_callchain(callchain); + hash_for_each_possible(liveness->func_instances, result, hl_node, key) + if (same_callsites(&result->callchain, callchain)) + return result; + + subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); + subprog_sz = (subprog + 1)->start - subprog->start; + size = sizeof(struct func_instance); + result = kvzalloc(size, GFP_KERNEL_ACCOUNT); + if (!result) + return ERR_PTR(-ENOMEM); + result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), + GFP_KERNEL_ACCOUNT); + if (!result->must_write_set) { + kvfree(result); + return ERR_PTR(-ENOMEM); + } + memcpy(&result->callchain, callchain, sizeof(*callchain)); + result->insn_cnt = subprog_sz; + hash_add(liveness->func_instances, &result->hl_node, key); + return result; +} + +static struct func_instance *lookup_instance(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + u32 frameno) +{ + struct callchain callchain; + + compute_callchain(env, st, &callchain, frameno); + return __lookup_instance(env, &callchain); +} + +int bpf_stack_liveness_init(struct bpf_verifier_env *env) +{ + env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT); + if (!env->liveness) + return -ENOMEM; + hash_init(env->liveness->func_instances); + return 0; +} + +void bpf_stack_liveness_free(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + struct hlist_node *tmp; + int bkt, i; + + if (!env->liveness) + return; + hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { + for (i = 0; i <= instance->callchain.curframe; i++) + kvfree(instance->frames[i]); + kvfree(instance->must_write_set); + kvfree(instance); + } + kvfree(env->liveness); +} + +/* + * Convert absolute instruction index @insn_idx to an index relative + * to start of the function corresponding to @instance. 
+ */ +static int relative_idx(struct func_instance *instance, u32 insn_idx) +{ + return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; +} + +static struct per_frame_masks *get_frame_masks(struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + if (!instance->frames[frame]) + return NULL; + + return &instance->frames[frame][relative_idx(instance, insn_idx)]; +} + +static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, + struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + struct per_frame_masks *arr; + + if (!instance->frames[frame]) { + arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT); + instance->frames[frame] = arr; + if (!arr) + return ERR_PTR(-ENOMEM); + } + return get_frame_masks(instance, frame, insn_idx); +} + +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) +{ + env->liveness->cur_instance = NULL; +} + +/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */ +static int ensure_cur_instance(struct bpf_verifier_env *env) +{ + struct bpf_liveness *liveness = env->liveness; + struct func_instance *instance; + + if (liveness->cur_instance) + return 0; + + instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + liveness->cur_instance = instance; + return 0; +} + +/* Accumulate may_read masks for @frame at @insn_idx */ +static int mark_stack_read(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +{ + struct per_frame_masks *masks; + u64 new_may_read; + + masks = alloc_frame_masks(env, instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + new_may_read = masks->may_read | mask; + if (new_may_read != masks->may_read && + ((new_may_read | masks->live_before) != masks->live_before)) + instance->updated = true; + masks->may_read |= mask; + return 0; +} + +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) +{ + int err; + + err = ensure_cur_instance(env); + err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); + return err; +} + +static void reset_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int i; + + liveness->write_insn_idx = insn_idx; + for (i = 0; i <= instance->callchain.curframe; i++) + liveness->write_masks_acc[i] = 0; +} + +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int err; + + err = ensure_cur_instance(env); + if (err) + return err; + + reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + return 0; +} + +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) +{ + env->liveness->write_masks_acc[frame] |= mask; +} + +static int commit_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct bpf_liveness *liveness = env->liveness; + u32 idx, frame, curframe, old_must_write; + struct per_frame_masks *masks; + u64 mask; + + if (!instance) + return 0; + + curframe = instance->callchain.curframe; + idx = relative_idx(instance, liveness->write_insn_idx); + for (frame = 0; frame <= curframe; frame++) { + mask = liveness->write_masks_acc[frame]; + /* avoid allocating frames for zero masks */ + if (mask == 0 && !instance->must_write_set[idx]) + continue; + masks = 
alloc_frame_masks(env, instance, frame, liveness->write_insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + old_must_write = masks->must_write; + /* + * If the instruction at this callchain is seen for the first time, set must_write equal + * to @mask. Otherwise, take the intersection with the previous value. + */ + if (instance->must_write_set[idx]) + mask &= old_must_write; + if (old_must_write != mask) { + masks->must_write = mask; + instance->updated = true; + } + if (old_must_write & ~mask) + instance->must_write_dropped = true; + } + instance->must_write_set[idx] = true; + liveness->write_insn_idx = 0; + return 0; +} + +/* + * Merge stack write marks in @env->liveness->write_masks_acc + * with information already in @env->liveness->cur_instance. + */ +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env) +{ + return commit_stack_write_marks(env, env->liveness->cur_instance); +} + +static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain) +{ + char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf); + char *buf = env->tmp_str_buf; + int i; + + buf += snprintf(buf, buf_end - buf, "("); + for (i = 0; i <= callchain->curframe; i++) + buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]); + snprintf(buf, buf_end - buf, ")"); + return env->tmp_str_buf; +} + +static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain, + char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new) +{ + u64 changed_bits = old ^ new; + u64 new_ones = new & changed_bits; + u64 new_zeros = ~new & changed_bits; + + if (!changed_bits) + return; + bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx); + if (new_ones) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones); + bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf); + } + if (new_zeros) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros); + bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf); + } + bpf_log(&env->log, "\n"); +} + +int bpf_jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); + +/* + * Returns an array succ of successor instruction indexes + * succ->items[0], ..., succ->items[n-1], where n = succ->cnt. + */ +inline struct bpf_iarray * +bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) +{ + static const struct opcode_info { + bool can_jump; + bool can_fallthrough; + } opcode_info_tbl[256] = { + [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, + #define _J(code, ...) 
\ + [BPF_JMP | code] = __VA_ARGS__, \ + [BPF_JMP32 | code] = __VA_ARGS__ + + _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), + _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), + _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), + #undef _J + }; + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = &prog->insnsi[idx]; + const struct opcode_info *opcode_info; + struct bpf_iarray *succ, *jt; + int insn_sz; + + jt = env->insn_aux_data[idx].jt; + if (unlikely(jt)) + return jt; + + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ + succ = env->succ; + succ->cnt = 0; + + opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (opcode_info->can_fallthrough) + succ->items[succ->cnt++] = idx + insn_sz; + + if (opcode_info->can_jump) + succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; + + return succ; +} + +__diag_pop(); + +static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain callchain = instance->callchain; + + /* Adjust @callchain to represent callchain one frame up */ + callchain.callsites[callchain.curframe] = 0; + callchain.sp_starts[callchain.curframe] = 0; + callchain.curframe--; + callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe]; + return __lookup_instance(env, &callchain); +} + +static u32 callchain_subprog_start(struct callchain *callchain) +{ + return callchain->sp_starts[callchain->curframe]; +} + +/* + * Transfer @may_read and @must_write_acc marks from the first instruction of @instance, + * to the call instruction in function instance calling @instance. 
+ */ +static int propagate_to_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain *callchain = &instance->callchain; + u32 this_subprog_start, callsite, frame; + struct func_instance *outer_instance; + struct per_frame_masks *insn; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + outer_instance = get_outer_instance(env, instance); + if (IS_ERR(outer_instance)) + return PTR_ERR(outer_instance); + callsite = callchain->callsites[callchain->curframe - 1]; + + reset_stack_write_marks(env, outer_instance, callsite); + for (frame = 0; frame < callchain->curframe; frame++) { + insn = get_frame_masks(instance, frame, this_subprog_start); + if (!insn) + continue; + bpf_mark_stack_write(env, frame, insn->must_write_acc); + err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); + if (err) + return err; + } + commit_stack_write_marks(env, outer_instance); + return 0; +} + +static inline bool update_insn(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + u64 new_before, new_after, must_write_acc; + struct per_frame_masks *insn, *succ_insn; + struct bpf_iarray *succ; + u32 s; + bool changed; + + succ = bpf_insn_successors(env, insn_idx); + if (succ->cnt == 0) + return false; + + changed = false; + insn = get_frame_masks(instance, frame, insn_idx); + new_before = 0; + new_after = 0; + /* + * New "must_write_acc" is an intersection of all "must_write_acc" + * of successors plus all "must_write" slots of instruction itself. + */ + must_write_acc = U64_MAX; + for (s = 0; s < succ->cnt; ++s) { + succ_insn = get_frame_masks(instance, frame, succ->items[s]); + new_after |= succ_insn->live_before; + must_write_acc &= succ_insn->must_write_acc; + } + must_write_acc |= insn->must_write; + /* + * New "live_before" is a union of all "live_before" of successors + * minus slots written by instruction plus slots read by instruction. + */ + new_before = (new_after & ~insn->must_write) | insn->may_read; + changed |= new_before != insn->live_before; + changed |= must_write_acc != insn->must_write_acc; + if (unlikely(env->log.level & BPF_LOG_LEVEL2) && + (insn->may_read || insn->must_write || + insn_idx == callchain_subprog_start(&instance->callchain) || + aux[insn_idx].prune_point)) { + log_mask_change(env, &instance->callchain, "live", + frame, insn_idx, insn->live_before, new_before); + log_mask_change(env, &instance->callchain, "written", + frame, insn_idx, insn->must_write_acc, must_write_acc); + } + insn->live_before = new_before; + insn->must_write_acc = must_write_acc; + return changed; +} + +/* Fixed-point computation of @live_before and @must_write_acc marks */ +static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + u32 i, frame, po_start, po_end, cnt, this_subprog_start; + struct callchain *callchain = &instance->callchain; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_subprog_info *subprog; + struct per_frame_masks *insn; + bool changed; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + /* + * If must_write marks were updated must_write_acc needs to be reset + * (to account for the case when new must_write sets became smaller). 
+ */ + if (instance->must_write_dropped) { + for (frame = 0; frame <= callchain->curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = 0; i < instance->insn_cnt; i++) { + insn = get_frame_masks(instance, frame, this_subprog_start + i); + insn->must_write_acc = 0; + } + } + } + + subprog = bpf_find_containing_subprog(env, this_subprog_start); + po_start = subprog->postorder_start; + po_end = (subprog + 1)->postorder_start; + cnt = 0; + /* repeat until fixed point is reached */ + do { + cnt++; + changed = false; + for (frame = 0; frame <= instance->callchain.curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = po_start; i < po_end; i++) + changed |= update_insn(env, instance, frame, insn_postorder[i]); + } + } while (changed); + + if (env->log.level & BPF_LOG_LEVEL2) + bpf_log(&env->log, "%s live stack update done in %d iterations\n", + fmt_callchain(env, callchain), cnt); + + /* transfer marks accumulated for outer frames to outer func instance (caller) */ + if (callchain->curframe > 0) { + err = propagate_to_outer_instance(env, instance); + if (err) + return err; + } + + return 0; +} + +/* + * Prepare all callchains within @env->cur_state for querying. + * This function should be called after each verifier.c:pop_stack() + * and whenever verifier.c:do_check_insn() processes subprogram exit. + * This would guarantee that visited verifier states with zero branches + * have their bpf_mark_stack_{read,write}() effects propagated in + * @env->liveness. + */ +int bpf_update_live_stack(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + int err, frame; + + bpf_reset_live_stack_callchain(env); + for (frame = env->cur_state->curframe; frame >= 0; --frame) { + instance = lookup_instance(env, env->cur_state, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + if (instance->updated) { + err = update_instance(env, instance); + if (err) + return err; + instance->updated = false; + instance->must_write_dropped = false; + } + } + return 0; +} + +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +{ + struct per_frame_masks *masks; + + masks = get_frame_masks(instance, frameno, insn_idx); + return masks && (masks->live_before & BIT(spi)); +} + +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance; + u32 frame; + + memset(q, 0, sizeof(*q)); + for (frame = 0; frame <= st->curframe; frame++) { + instance = lookup_instance(env, st, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + q->instances[frame] = instance; + } + q->curframe = st->curframe; + q->insn_idx = st->insn_idx; + return 0; +} + +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +{ + /* + * Slot is alive if it is read before q->st->insn_idx in current func instance, + * or if for some outer func instance: + * - alive before callsite if callsite calls callback, otherwise + * - alive after callsite + */ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance, *curframe_instance; + u32 i, callsite; + bool alive; + + curframe_instance = q->instances[q->curframe]; + if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + return true; + + for (i = frameno; i < q->curframe; i++) { + callsite = curframe_instance->callchain.callsites[i]; + instance = q->instances[i]; + alive = bpf_calls_callback(env, 
callsite) + ? is_live_before(instance, callsite, frameno, spi) + : is_live_before(instance, callsite + 1, frameno, spi); + if (alive) + return true; + } + + return false; +} diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3969eb0382af..c93a756e035c 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT, map->numa_node); if (!new) return -ENOMEM; @@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { - u32 int_data; - /* * Key is expected to be u64, which stores the cgroup_inode_id */ - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i64(key_type)) return -EINVAL; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 38050f4ee400..a0c3b35de2ce 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", + [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; @@ -498,6 +499,10 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; + case BPF_DYNPTR_TYPE_FILE: + return "file"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: @@ -540,19 +545,6 @@ static char slot_type_char[] = { [STACK_IRQ_FLAG] = 'f' }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); -} - #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN @@ -770,7 +762,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); - print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } @@ -803,9 +794,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie break; types_buf[j] = '\0'; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: @@ -814,7 +803,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); @@ -829,9 +817,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!reg->ref_obj_id) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + 
verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); @@ -839,9 +826,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie case STACK_MISC: case STACK_ZERO: default: - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 889374722d0a..bd45dda9dc35 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) /* Defer barriers into worker to let the rest of map memory to be freed */ memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); - queue_work(system_unbound_wq, ©->work); + queue_work(system_dfl_wq, ©->work); } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 868cc2c43899..8e88201c98bf 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -11,8 +11,6 @@ struct bpf_netns_link { struct bpf_link link; - enum bpf_attach_type type; - enum netns_bpf_attach_type netns_type; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet @@ -21,6 +19,7 @@ struct bpf_netns_link { */ struct net *net; struct list_head node; /* node in list of links attached to net */ + enum netns_bpf_attach_type netns_type; }; /* Protects updates to netns_bpf */ @@ -216,7 +215,7 @@ static int bpf_netns_link_fill_info(const struct bpf_link *link, mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; - info->netns.attach_type = net_link->type; + info->netns.attach_type = link->attach_type; return 0; } @@ -230,7 +229,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, - info.netns.attach_type); + link->attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { @@ -501,9 +500,8 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, - &bpf_netns_link_ops, prog); + &bpf_netns_link_ops, prog, type); net_link->net = net; - net_link->type = type; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index c9d45c9d6918..aef7b0bc96d6 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config USERMODE_DRIVER - bool - default n - menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF @@ -10,7 +6,6 @@ menuconfig BPF_PRELOAD # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST - select USERMODE_DRIVER help This builds kernel module with several embedded BPF programs that are pinned into BPF FS mount point as human readable files that are diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h index ebdc6c0cdb70..49b1d515a847 100644 --- 
a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h @@ -89,10 +89,7 @@ iterators_bpf__load(struct iterators_bpf *skel) { struct bpf_load_and_run_opts opts = {}; int err; - - opts.ctx = (struct bpf_loader_ctx *)skel; - opts.data_sz = 6008; - opts.data = (void *)"\ + static const char opts_data[] __attribute__((__aligned__(8))) = "\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ @@ -126,190 +123,196 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\ -\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\ +\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\ \x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\ \0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\ \0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\ -\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\ -\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\ -\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\ -\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\ -\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\ -\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\ -\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ -\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\ -\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\ -\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\ -\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\ -\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\ -\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\ -\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\ -\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\ -\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\ -\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\ -\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\ -\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\ -\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\ -\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\ -\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\ -\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\ -\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\ -\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\ -\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\ -\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\ 
-\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\ -\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\ -\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\ -\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\ -\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\ -\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\ -\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\ -\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\ -\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\ -\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\ -\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\ -\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\ -\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\ -\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\ -\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\ -\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\ -\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\ -\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\ -\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\ -\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\ -\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\ -\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\ -\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\ -\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\ -\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\ -\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\ -\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ -\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\ -\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\ -\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\ -\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\ -\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ -\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\ -\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ -\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\ -\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\ 
-\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\ -\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\ -\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\ -\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\ -\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\ -\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\ -\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\ -\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\ -\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\ -\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\ -\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\ -\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\ -\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\ -\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\ -\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\ -\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\ -\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\ -\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\ -\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ -\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\ -\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\ -\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\ -\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\ -\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\ -\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ +\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\ +\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\ +\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\ +\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\ +\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\ +\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\ +\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ +\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\ +\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\ +\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\ +\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\ +\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\ 
+\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\ +\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\ +\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\ +\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\ +\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\ +\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\ +\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\ +\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\ +\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\ +\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\ +\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\ +\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\ +\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\ +\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\ +\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\ +\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\ +\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\ +\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\ +\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\ +\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\ +\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\ +\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\ +\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\ +\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\ +\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\ +\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\ +\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\ +\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ +\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\ +\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\ +\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\ +\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\ +\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\ +\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\ +\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\ +\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\ +\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\ +\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\ 
+\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\ +\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\ +\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\ +\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\ +\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\ +\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\ +\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\ +\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\ +\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\ +\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\ +\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\ +\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\ +\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\ +\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\ +\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\ +\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\ +\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\ +\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\ +\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\ +\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\ +\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\ +\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\ +\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\ +\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\ +\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\ +\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\ +\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\ +\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\ +\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\ +\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\ +\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\ +\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\ +\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\ +\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\ +\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\ 
+\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\ +\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\ +\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\ +\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\ +\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\ +\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\ +\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ \x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\ \x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\ -\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ -\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\ -\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\ -\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\ -\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\ -\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\ -\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\ -\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\ -\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\ -\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\ -\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\ -\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\ -\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\ -\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\ -\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\ -\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\ -\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\ -\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\ -\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\ -\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\ -\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\ -\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\ -\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\ -\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\ -\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\ -\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ 
-\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\ -\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ -\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ +\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\ +\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ +\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\ +\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\ +\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\ +\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\ +\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\ +\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\ +\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\ +\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\ +\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\ +\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\ +\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\ +\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\ +\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\ +\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\ +\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\ +\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\ +\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\ +\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\ +\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\ +\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\ +\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\ +\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\ +\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\ +\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\ +\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\ +\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\ +\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ +\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ \0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\ -\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ -\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ +\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ +\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ 
\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\ \xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\ \xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\ -\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\ -\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\ -\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\ -\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\ -\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\ -\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\ -\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\ -\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\ -\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\ -\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\ -\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\ -\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\ -\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\ -\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\ -\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\ -\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\ -\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\ -\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\ -\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\ -\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\ -\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\ -\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\ -\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\ -\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\ -\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\ -\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\ -\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; - opts.insns_sz = 2216; - opts.insns = (void *)"\ +\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\ +\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\ +\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\ +\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\ +\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\ +\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\ +\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\ 
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\ +\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\ +\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\ +\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\ +\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\ +\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\ +\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\ +\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\ +\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\ +\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\ +\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\ +\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\ +\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\ +\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\ +\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\ +\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\ +\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\ +\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\ +\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; + static const char opts_insn[] __attribute__((__aligned__(8))) = "\ \xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\ \0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\ \x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\ @@ -318,72 +321,87 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\ \0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\ \0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ -\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ -\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ +\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\ -\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\ +\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\ \0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ 
-\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ +\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ +\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ \x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\ \x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ +\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ \0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\ -\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\ -\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\ +\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\ +\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\ \x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\ -\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\ -\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\ -\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\ -\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\ -\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\ -\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\ -\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ -\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\ -\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\ -\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\ -\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\ 
-\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\ -\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\ -\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\ -\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\ -\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\ -\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\ -\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\ -\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\ -\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\ -\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\ -\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\ -\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\ -\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\ -\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\ -\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\ -\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\ -\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; +\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\ +\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\ +\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\ +\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\ +\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\ +\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\ +\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\ +\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\ +\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\ +\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\ +\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\ +\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ +\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\ +\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\ 
+\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\ +\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\ +\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\ +\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\ +\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\ +\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\ +\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\ +\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\ +\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\ +\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\ +\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\ +\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\ +\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\ +\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\ +\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\ +\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\ +\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\ +\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\ +\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\ +\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\ +\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\ +\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\ +\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\ +\0\0\x95\0\0\0\0\0\0\0"; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = sizeof(opts_data) - 1; + opts.data = (void *)opts_data; + opts.insns_sz = sizeof(opts_insn) - 1; + opts.insns = (void *)opts_insn; + err = bpf_load_and_run(&opts); if (err < 0) return err; diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c index 53a73c841c13..85d8fcb56fb7 100644 --- a/kernel/bpf/prog_iter.c +++ b/kernel/bpf/prog_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_prog_seq_ops = { .show = bpf_prog_seq_show, }; -BTF_ID_LIST(btf_bpf_prog_id) -BTF_ID(struct, bpf_prog) +BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog) static const struct bpf_iter_seq_info bpf_prog_seq_info = { .seq_ops = &bpf_prog_seq_ops, diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 37b80a23ae1a..99c63d982c5d 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -2,7 +2,6 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #include <linux/interval_tree_generic.h> #include <linux/slab.h> -#include <linux/bpf_mem_alloc.h> #include <linux/bpf.h> #include "range_tree.h" @@ -21,7 +20,7 @@ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). * * The implementation relies on external lock to protect rbtree-s. - * The alloc/free of range_node-s is done via bpf_mem_alloc. + * The alloc/free of range_node-s is done via kmalloc_nolock(). * * bpf arena is using range_tree to represent unallocated slots. * At init time: @@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - migrate_disable(); - new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) } else { /* in the middle of the clearing range */ range_it_remove(rn, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, rn); - migrate_enable(); + kfree_nolock(rn); } } return 0; @@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) range_it_remove(right, rt); left->rn_last = right->rn_last; range_it_insert(left, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, right); - migrate_enable(); + kfree_nolock(right); } else if (left) { /* Combine with the left range */ range_it_remove(left, rt); @@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - migrate_disable(); - left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; @@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); - bpf_mem_free(&bpf_global_ma, rn); + kfree_nolock(rn); } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 719d73299397..f6a075ffac63 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -13,7 +13,7 @@ #include <linux/btf_ids.h> #include <asm/rqspinlock.h> -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ @@ -30,6 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + bool overwrite_mode; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than @@ -73,6 +74,7 @@ struct bpf_ringbuf { unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; + unsigned long overwrite_pos; /* position after the last overwritten record */ char data[] __aligned(PAGE_SIZE); }; @@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work) * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. 
*/ -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode) { struct bpf_ringbuf *rb; @@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; + rb->overwrite_mode = overwrite_mode; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { + bool overwrite_mode = false; struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_RB_OVERWRITE) { + if (attr->map_type != BPF_MAP_TYPE_RINGBUF) + return ERR_PTR(-EINVAL); + overwrite_mode = true; + } + if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) @@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); @@ -216,6 +226,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) static void bpf_ringbuf_free(struct bpf_ringbuf *rb) { + irq_work_sync(&rb->work); + /* copy pages pointer and nr_pages to local variable, as we are going * to unmap rb itself with vunmap() below */ @@ -293,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } +/* + * Return an estimate of the available data in the ring buffer. + * Note: the returned value can exceed the actual ring buffer size because the + * function is not synchronized with the producer. The producer acquires the + * ring buffer's spinlock, but this function does not. + */ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { - unsigned long cons_pos, prod_pos; + unsigned long cons_pos, prod_pos, over_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); - prod_pos = smp_load_acquire(&rb->producer_pos); - return prod_pos - cons_pos; + + if (unlikely(rb->overwrite_mode)) { + over_pos = smp_load_acquire(&rb->overwrite_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - max(cons_pos, over_pos); + } else { + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; + } } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) @@ -402,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) return (void*)((addr & PAGE_MASK) - off); } +static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb, + unsigned long new_prod_pos, + unsigned long cons_pos, + unsigned long pend_pos) +{ + /* + * No space if oldest not yet committed record until the newest + * record span more than (ringbuf_size - 1). + */ + if (new_prod_pos - pend_pos > rb->mask) + return false; + + /* Ok, we have space in overwrite mode */ + if (unlikely(rb->overwrite_mode)) + return true; + + /* + * No space if producer position advances more than (ringbuf_size - 1) + * ahead of consumer position when not in overwrite mode. 
+ */ + if (new_prod_pos - cons_pos > rb->mask) + return false; + + return true; +} + +static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len) +{ + hdr_len &= ~BPF_RINGBUF_DISCARD_BIT; + return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8); +} + static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags; struct bpf_ringbuf_hdr *hdr; - u32 len, pg_off, tmp_size, hdr_len; + u32 len, pg_off, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -429,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; - tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; - tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); - pend_pos += tmp_size; + pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); } rb->pending_pos = pend_pos; - /* check for out of ringbuf space: - * - by ensuring producer position doesn't advance more than - * (ringbuf_size - 1) ahead - * - by ensuring oldest not yet committed record until newest - * record does not span more than (ringbuf_size - 1) - */ - if (new_prod_pos - cons_pos > rb->mask || - new_prod_pos - pend_pos > rb->mask) { + if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } + /* + * In overwrite mode, advance overwrite_pos when the ring buffer is full. + * The key points are to stay on record boundaries and consume enough records + * to fit the new one. + */ + if (unlikely(rb->overwrite_mode)) { + over_pos = rb->overwrite_pos; + while (new_prod_pos - over_pos > rb->mask) { + hdr = (void *)rb->data + (over_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + /* + * The bpf_ringbuf_has_space() check above ensures we won’t + * step over a record currently being worked on by another + * producer. + */ + over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); + } + /* + * smp_store_release(&rb->producer_pos, new_prod_pos) at + * the end of the function ensures that when consumer sees + * the updated rb->producer_pos, it always sees the updated + * rb->overwrite_pos, so when consumer reads overwrite_pos + * after smp_load_acquire(r->producer_pos), the overwrite_pos + * will always be valid. 
+ */ + WRITE_ONCE(rb->overwrite_pos, over_pos); + } + hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; @@ -576,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); + case BPF_RB_OVERWRITE_POS: + return smp_load_acquire(&rb->overwrite_pos); default: return 0; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 338305c8852c..f7d0c8d4644e 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -89,15 +89,14 @@ struct rqspinlock_timeout { DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); EXPORT_SYMBOL_GPL(rqspinlock_held_locks); -static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +static bool is_lock_released(rqspinlock_t *lock, u32 mask) { if (!(atomic_read_acquire(&lock->val) & (mask))) return true; return false; } -static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_AA(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int cnt = min(RES_NR_HELD, rqh->cnt); @@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, * more locks, which reduce to ABBA). This is not exhaustive, and we rely on * timeouts as the final line of defense. */ -static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int rqh_cnt = min(RES_NR_HELD, rqh->cnt); @@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, * Let's ensure to break out of this loop if the lock is available for * us to potentially acquire. */ - if (is_lock_released(lock, mask, ts)) + if (is_lock_released(lock, mask)) return 0; /* @@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, return 0; } -static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) -{ - int ret; - - ret = check_deadlock_AA(lock, mask, ts); - if (ret) - return ret; - ret = check_deadlock_ABBA(lock, mask, ts); - if (ret) - return ret; - - return 0; -} - static noinline int check_timeout(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) { - u64 time = ktime_get_mono_fast_ns(); u64 prev = ts->cur; + u64 time; if (!ts->timeout_end) { - ts->cur = time; - ts->timeout_end = time + ts->duration; + if (check_deadlock_AA(lock)) + return -EDEADLK; + ts->cur = ktime_get_mono_fast_ns(); + ts->timeout_end = ts->cur + ts->duration; return 0; } + time = ktime_get_mono_fast_ns(); if (time > ts->timeout_end) return -ETIMEDOUT; @@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, */ if (prev + NSEC_PER_MSEC < time) { ts->cur = time; - return check_deadlock(lock, mask, ts); + return check_deadlock_ABBA(lock, mask); } return 0; @@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) int val, ret = 0; RES_INIT_TIMEOUT(ts); + /* + * The fast path is not invoked for the TAS fallback, so we must grab + * the deadlock detection entry here. 
+ */ grab_held_lock_entry(lock); /* @@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) goto queue; } - /* - * Grab an entry in the held locks array, to enable deadlock detection. - */ - grab_held_lock_entry(lock); + /* Deadlock detection entry already held after failing fast path. */ /* * We're pending, wait for the owner to go away. @@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * queuing. */ queue: - lockevent_inc(lock_slowpath); /* - * Grab deadlock detection entry for the queue path. + * Do not queue if we're a waiter and someone is attempting this lock on + * the same CPU. In case of NMIs, this prevents long timeouts where we + * interrupt the pending waiter, and the owner, that will eventually + * signal the head of our queue, both of which are logically but not + * physically part of the queue, hence outside the scope of the idx > 0 + * check above for the trylock fallback. */ - grab_held_lock_entry(lock); + if (check_deadlock_AA(lock)) { + ret = -EDEADLK; + goto err_release_entry; + } + lockevent_inc(lock_slowpath); + /* Deadlock detection entry already held after failing fast path. */ node = this_cpu_ptr(&rqnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -467,19 +463,17 @@ queue: * not be nested NMIs taking spinlocks. That may not be true in * some architectures even though the chance of needing more than * 4 nodes will still be extremely unlikely. When that happens, - * we fall back to spinning on the lock directly without using - * any MCS node. This is not the most elegant solution, but is - * simple enough. + * we fall back to attempting a trylock operation without using + * any MCS node. Unlike qspinlock which cannot fail, we have the + * option of failing the slow path, and under contention, such a + * trylock spinning will likely be treated unfairly due to lack of + * queueing, hence do not spin. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) { lockevent_inc(lock_no_node); - RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); - while (!queued_spin_trylock(lock)) { - if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { - lockevent_inc(rqspinlock_lock_timeout); - goto err_release_node; - } - cpu_relax(); + if (!queued_spin_trylock(lock)) { + ret = -EDEADLK; + goto err_release_node; } goto release; } @@ -540,7 +534,7 @@ queue: val = arch_mcs_spin_lock_contended(&node->locked); if (val == RES_TIMEOUT_VAL) { - ret = -EDEADLK; + ret = -ETIMEDOUT; goto waitq_timeout; } @@ -575,6 +569,14 @@ queue: val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + /* Disable queue destruction when we detect deadlocks. */ + if (ret == -EDEADLK) { + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + arch_mcs_spin_unlock_contended(&next->locked); + goto err_release_node; + } + waitq_timeout: if (ret) { /* @@ -666,6 +668,27 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); __bpf_kfunc_start_defs(); +static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? 
"_irqsave" : ""); + bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock); + bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt); + for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++) + bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]); + bpf_stream_dump_stack(ss); + })); +} + +#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; }) + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) { int ret; @@ -676,6 +699,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false); preempt_enable(); return ret; } @@ -698,6 +722,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign local_irq_save(flags); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true); local_irq_restore(flags); preempt_enable(); return ret; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..da3d328f5c15 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map) sizeof(struct bpf_stack_build_id) : sizeof(u64); } +/** + * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth + * @size: Size of the buffer/map value in bytes + * @elem_size: Size of each stack trace element + * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...) + * + * Return: Maximum number of stack trace entries that can be safely stored + */ +static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags) +{ + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 max_depth; + u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack); + + max_depth = size / elem_size; + max_depth += skip; + if (max_depth > curr_sysctl_max_stack) + return curr_sysctl_max_stack; + + return max_depth; +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + @@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; - u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; @@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map, /* skipping more than usable stack trace */ return -EFAULT; - trace_nr = trace->nr - skip; + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); @@ -300,22 +323,19 @@ static long __bpf_get_stackid(struct bpf_map *map, BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 max_depth = map->value_size / stack_map_data_size(map); - u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 elem_size = stack_map_data_size(map); bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; + u32 max_depth; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | 
BPF_F_REUSE_STACKID))) return -EINVAL; - max_depth += skip; - if (max_depth > sysctl_perf_event_max_stack) - max_depth = sysctl_perf_event_max_stack; - - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, - false, false); + max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); + trace = get_perf_callchain(regs, kernel, user, max_depth, + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; nr_kernel = count_kernel_ip(trace); + __u64 nr = trace->nr; /* save original */ if (kernel) { - __u64 nr = trace->nr; - trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); - - /* restore nr */ - trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } + + /* restore nr */ + trace->nr = nr; + return ret; } @@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { - u32 trace_nr, copy_len, elem_size, num_elem, max_depth; + u32 trace_nr, copy_len, elem_size, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto clear; } - num_elem = size / elem_size; - max_depth = num_elem + skip; - if (sysctl_perf_event_max_stack < max_depth) - max_depth = sysctl_perf_event_max_stack; + max_depth = stack_map_calculate_max_depth(size, elem_size, flags); if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ - if (trace_in) + if (trace_in) { trace = trace_in; - else if (kernel && task) + trace->nr = min_t(u32, trace->nr, max_depth); + } else if (kernel && task) { trace = get_callchain_entry_for_task(task, max_depth); - else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, - crosstask, false); + } else { + trace = get_perf_callchain(regs, kernel, user, max_depth, + crosstask, false, 0); + } if (unlikely(!trace) || trace->nr < skip) { if (may_fault) @@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, } trace_nr = trace->nr - skip; - trace_nr = (trace_nr <= num_elem) ? 
trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; @@ -646,7 +664,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall */ -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return bpf_stackmap_extract(map, key, value, true); +} + +/* Called from syscall */ +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; @@ -663,7 +689,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); - old_bucket = xchg(&smap->buckets[id], bucket); + if (delete) + old_bucket = bucket; + else + old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; @@ -754,6 +783,7 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, + .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 000000000000..0b6bc3f30335 --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/gfp.h> +#include <linux/memory.h> +#include <linux/mutex.h> + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->total_len = len; + elem->consumed_len = 0; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + size_t alloc_size; + + /* + * Length denotes the amount of data to be written as part of stream element, + * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can + * accomodate, therefore deny allocations that won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + alloc_size = offsetof(struct bpf_stream_elem, str[len]); + elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1); + if (!elem) + return NULL; + + bpf_stream_elem_init(elem, len); + + return elem; +} + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream + * log, elements will be popped at once and reversed to print the log. 
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->total_len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) +{ + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +static void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + kfree_nolock(elem); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) +{ + return stream->backlog_head; +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static void bpf_stream_backlog_fill(struct bpf_stream *stream) +{ + struct llist_node *head, *tail; + + if (llist_empty(&stream->log)) + return; + tail = llist_del_all(&stream->log); + if (!tail) + return; + head = llist_reverse_order(tail); + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + return; +} + +static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) +{ + int rem = elem->total_len - elem->consumed_len; + int used = min(rem, *len); + + elem->consumed_len += used; + *len -= used; + + return elem->consumed_len == elem->total_len; +} + +static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) +{ + int rem_len = len, cons_len, ret = 0; + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + + mutex_lock(&stream->lock); + + while (rem_len) { + int pos = len - rem_len; + bool cont; + + node = bpf_stream_backlog_peek(stream); + if (!node) { + bpf_stream_backlog_fill(stream); + node = bpf_stream_backlog_peek(stream); + } + if (!node) + break; + elem = container_of(node, typeof(*elem), node); + + cons_len = elem->consumed_len; + cont = bpf_stream_consume_elem(elem, &rem_len) == false; + + ret = copy_to_user(buf + pos, elem->str + cons_len, + elem->consumed_len - cons_len); + /* Restore in case of error. */ + if (ret) { + ret = -EFAULT; + elem->consumed_len = cons_len; + break; + } + + if (cont) + continue; + bpf_stream_backlog_pop(stream); + bpf_stream_release_capacity(stream, elem); + bpf_stream_free_elem(elem); + } + + mutex_unlock(&stream->lock); + return ret ? 
ret : len - rem_len; +} + +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) +{ + struct bpf_stream *stream; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -ENOENT; + return bpf_stream_read(stream, buf, len); +} + +__bpf_kfunc_start_defs(); + +/* + * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the + * enum in headers. + */ +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + struct bpf_prog_aux *aux = aux__prog; + u32 fmt_size = strlen(fmt__str) + 1; + struct bpf_stream *stream; + u32 data_len = len__sz; + int ret, num_args; + + stream = bpf_stream_get(stream_id, aux); + if (!stream) + return -ENOENT; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* Exclude NULL byte during push. */ + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc_end_defs(); + +/* Added kfunc to common_btf_ids */ + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + mutex_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + ss->len += ret; + /* Exclude NULL byte during push. 
*/ + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = tail = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} + +struct dump_stack_ctx { + struct bpf_stream_stage *ss; + int err; +}; + +static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct dump_stack_ctx *ctxp = cookie; + const char *file = "", *line = ""; + struct bpf_prog *prog; + int num, ret; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (prog) { + ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); + if (ret < 0) + goto end; + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", + (void *)(long)ip, line, file, num); + return !ctxp->err; + } +end: + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip); + return !ctxp->err; +} + +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) +{ + struct dump_stack_ctx ctx = { .ss = ss }; + int ret; + + ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", + raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), + current->pid, current->comm); + if (ret) + return ret; + ret = bpf_stream_stage_printk(ss, "Call trace:\n"); + if (ret) + return ret; + arch_bpf_stack_walk(dump_stack_cb, &ctx); + if (ctx.err) + return ctx.err; + return bpf_stream_stage_printk(ss, "\n"); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64c3393e8270..4ff82144f885 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include <crypto/sha2.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> @@ -36,6 +37,9 @@ #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <linux/tracepoint.h> +#include <linux/overflow.h> +#include <linux/cookie.h> +#include <linux/verification.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -52,6 +56,7 @@ #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); +DEFINE_COOKIE(bpf_map_cookie); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); @@ -153,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); + synchronize_rcu_expedited(); } static void unpin_uptr_kaddr(void *kaddr) @@ -315,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); + err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -515,6 
+520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, return ptr; } +void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; @@ -578,7 +598,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return try_alloc_pages(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT @@ -669,6 +689,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -693,7 +714,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) if (IS_ERR_OR_NULL(rec)) return NULL; - size = offsetof(struct btf_record, fields[rec->cnt]); + size = struct_size(rec, fields, rec->cnt); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); @@ -722,6 +743,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -748,7 +770,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return false; if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct btf_record, fields[rec_a->cnt]); + size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. @@ -780,6 +802,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -804,6 +833,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -857,6 +889,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator. */ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -884,6 +917,7 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + bpf_map_owner_free(map); bpf_map_free(map); } @@ -901,7 +935,7 @@ static void bpf_map_free_in_work(struct bpf_map *map) /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. 
*/ - queue_work(system_unbound_wq, &map->work); + queue_work(system_dfl_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) @@ -978,12 +1012,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map_type_contains_progs(map)) { - spin_lock(&map->owner.lock); - type = map->owner.type; - jited = map->owner.jited; - spin_unlock(&map->owner.lock); + spin_lock(&map->owner_lock); + if (map->owner) { + type = map->owner->type; + jited = map->owner->jited; } + spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" @@ -1128,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif @@ -1200,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) return src - orig_src; } +EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, @@ -1233,7 +1268,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1265,6 +1301,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -1334,9 +1371,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1457,6 +1494,7 @@ static int map_create(union bpf_attr *attr, bool kernel) case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -1486,10 +1524,14 @@ static int map_create(union bpf_attr *attr, bool kernel) if (err < 0) goto free_map; + preempt_disable(); + map->cookie = gen_cookie_next(&bpf_map_cookie); + preempt_enable(); + atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - spin_lock_init(&map->owner.lock); + spin_lock_init(&map->owner_lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, @@ -1526,7 +1568,30 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if 
(!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + err = -EINVAL; + goto free_map; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1619,7 +1684,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); -int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { return -ENOTSUPP; } @@ -1661,9 +1727,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - if (attr->flags & ~BPF_F_LOCK) - return -EINVAL; - CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1671,9 +1734,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + if (err) + return err; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) @@ -1736,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, ~0); + if (err) goto err_put; - } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { @@ -1944,13 +2005,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - return -EINVAL; - } + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2007,12 +2064,9 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2150,7 +2204,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); @@ -2266,7 +2321,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (!in_irq() && !irqs_disabled()) + if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2364,7 +2419,7 @@ static void __bpf_prog_put(struct bpf_prog *prog) struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { 
INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { @@ -2398,6 +2453,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) struct bpf_prog_stats *stats; unsigned int flags; + if (unlikely(!prog->stats)) + return; + stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); @@ -2753,8 +2811,61 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2918,6 +3029,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 
1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; @@ -2976,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; @@ -3068,7 +3189,7 @@ static int bpf_obj_get(const union bpf_attr *attr) */ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable) + enum bpf_attach_type attach_type, bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); @@ -3077,12 +3198,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, link->id = 0; link->ops = ops; link->prog = prog; + link->attach_type = attach_type; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type) { - bpf_link_init_sleepable(link, type, ops, prog, false); + bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } static void bpf_link_free_id(int id) @@ -3227,7 +3350,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { - seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? + "kretprobe_multi" : "kprobe_multi"); + else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + "uretprobe_multi" : "uprobe_multi"); + else + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) 
for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); @@ -3402,10 +3532,12 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" - "target_btf_id:\t%u\n", - tr_link->attach_type, + "target_btf_id:\t%u\n" + "cookie:\t%llu\n", + link->attach_type, target_obj_id, - target_btf_id); + target_btf_id, + tr_link->link.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, @@ -3414,7 +3546,8 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - info->tracing.attach_type = tr_link->attach_type; + info->tracing.attach_type = link->attach_type; + info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3432,7 +3565,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, - u64 bpf_cookie) + u64 bpf_cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -3500,8 +3634,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog); - link->attach_type = prog->expected_attach_type; + &bpf_tracing_link_lops, prog, attach_type); + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -3650,8 +3784,10 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, - "tp_name:\t%s\n", - raw_tp_link->btp->tp->name); + "tp_name:\t%s\n" + "cookie:\t%llu\n", + raw_tp_link->btp->tp->name, + raw_tp_link->cookie); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, @@ -3687,6 +3823,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; + info->raw_tracepoint.cookie = raw_tp_link->cookie; if (!ubuf) return 0; @@ -3793,20 +3930,46 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } + +static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, addr; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &addr, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "missed:\t%lu\n" + "addr:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, missed, addr, + type == BPF_FD_TYPE_KRETPROBE ? 
"kretprobe" : "kprobe", + event->bpf_cookie); +} #endif #ifdef CONFIG_UPROBE_EVENTS static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { + u64 ref_ctr_offset, offset; char __user *uname; - u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; @@ -3818,8 +3981,34 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; + info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } + +static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, ref_ctr_offset; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &ref_ctr_offset, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "ref_ctr_offset:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, ref_ctr_offset, + type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe", + event->bpf_cookie); +} #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, @@ -3888,10 +4077,79 @@ static int bpf_perf_link_fill_link_info(const struct bpf_link *link, } } +static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + seq_printf(seq, + "type:\t%u\n" + "config:\t%llu\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + event->attr.type, event->attr.config, + "event", event->bpf_cookie); +} + +static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + int err; + const char *name; + u32 prog_id; + + err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, + NULL, NULL); + if (err) + return; + + seq_printf(seq, + "tp_name:\t%s\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, "tracepoint", event->bpf_cookie); +} + +static void bpf_probe_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ +#ifdef CONFIG_KPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) + return bpf_perf_link_fdinfo_kprobe(event, seq); +#endif + +#ifdef CONFIG_UPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) + return bpf_perf_link_fdinfo_uprobe(event, seq); +#endif +} + +static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_perf_link *perf_link; + const struct perf_event *event; + + perf_link = container_of(link, struct bpf_perf_link, link); + event = perf_get_event(perf_link->perf_file); + if (IS_ERR(event)) + return; + + switch (event->prog->type) { + case BPF_PROG_TYPE_PERF_EVENT: + return bpf_perf_event_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_TRACEPOINT: + return bpf_tracepoint_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_KPROBE: + return bpf_probe_link_show_fdinfo(event, seq); + default: + return; + } +} + static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, + .show_fdinfo = bpf_perf_link_show_fdinfo, }; static int bpf_perf_link_attach(const union bpf_attr 
*attr, struct bpf_prog *prog) @@ -3914,7 +4172,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro err = -ENOMEM; goto out_put_file; } - bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, + attr->link_create.attach_type); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); @@ -3946,7 +4205,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name, u64 cookie) + const char __user *user_tp_name, u64 cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3969,7 +4229,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -3991,7 +4251,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog, + &bpf_raw_tp_link_lops, prog, attach_type, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ -4033,7 +4293,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; - fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); if (fd < 0) bpf_prog_put(prog); return fd; @@ -4183,6 +4443,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } +static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, + bool check_atype) +{ + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return true; + case BPF_PROG_TYPE_LSM: + return check_atype ? 
atype == BPF_LSM_CGROUP : true; + default: + return false; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ @@ -4213,6 +4492,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) + return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; @@ -4230,6 +4512,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } + if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { + ret = cgroup_bpf_prog_attach(attr, ptype, prog); + goto out; + } + switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: @@ -4241,20 +4528,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SKB: - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_LSM: - if (ptype == BPF_PROG_TYPE_LSM && - prog->expected_attach_type != BPF_LSM_CGROUP) - ret = -EINVAL; - else - ret = cgroup_bpf_prog_attach(attr, ptype, prog); - break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) @@ -4265,7 +4538,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) default: ret = -EINVAL; } - +out: if (ret) bpf_prog_put(prog); return ret; @@ -4293,6 +4566,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags || attr->relative_fd) + return -EINVAL; } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { @@ -4773,19 +5049,19 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { + if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); + if (!insns_sanitized) + return -ENOMEM; + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) + return -EFAULT; + } else { info.xlated_prog_insns = 0; - goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); - if (!insns_sanitized) - return -ENOMEM; - uinsns = u64_to_user_ptr(info.xlated_prog_insns); - ulen = min_t(u32, info.xlated_prog_len, ulen); - fault = copy_to_user(uinsns, insns_sanitized, ulen); - kfree(insns_sanitized); - if (fault) - return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { @@ -4998,6 +5274,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5022,6 +5301,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != 
SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; @@ -5083,6 +5381,21 @@ static int bpf_link_get_info_by_fd(struct file *file, } +static int token_get_info_by_fd(struct file *file, + struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); + if (err) + return err; + return bpf_token_get_info_by_fd(token, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -5106,6 +5419,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_token_fops) + return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + attr, uattr); return -EINVAL; } @@ -5193,21 +5509,10 @@ static int bpf_task_fd_query_copy(const union bpf_attr *attr, if (put_user(zero, ubuf)) return -EFAULT; - } else if (input_len >= len + 1) { - /* ubuf can hold the string with NULL terminator */ - if (copy_to_user(ubuf, buf, len + 1)) - return -EFAULT; } else { - /* ubuf cannot hold the string with NULL terminator, - * do a partial copy with NULL terminator. 
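/*
 * Hedged user-space sketch (editor's addition, not part of the patch): how the
 * new map-hash query above could be exercised. It assumes struct bpf_map_info
 * grew the 'hash'/'hash_size' fields used by bpf_map_get_info_by_fd() in this
 * series and that libbpf's bpf_map_get_info_by_fd() wrapper is available;
 * read_map_sha256() is a hypothetical helper name.
 */
#include <string.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static int read_map_sha256(int map_fd, unsigned char digest[32])
{
        struct bpf_map_info info;
        __u32 len = sizeof(info);

        memset(&info, 0, sizeof(info));
        /* The kernel now copies the info struct in first, so these request
         * fields are seen before the reply is written back. */
        info.hash = (__u64)(unsigned long)digest;
        info.hash_size = 32;            /* must equal SHA256_DIGEST_SIZE */

        return bpf_map_get_info_by_fd(map_fd, &info, &len);
}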
- */ - char zero = '\0'; - - err = -ENOSPC; - if (copy_to_user(ubuf, buf, input_len - 1)) - return -EFAULT; - if (put_user(zero, ubuf + input_len - 1)) - return -EFAULT; + err = bpf_copy_to_user(ubuf, buf, input_len, len); + if (err == -EFAULT) + return err; } } @@ -5385,7 +5690,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -5394,7 +5700,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, + attr->link_create.attach_type); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5403,7 +5710,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -5792,6 +6100,28 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } +#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd + +static int prog_stream_read(union bpf_attr *attr) +{ + char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); + u32 len = attr->prog_stream_read.stream_buf_len; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_stream_read.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); + bpf_prog_put(prog); + + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5813,7 +6143,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -5928,6 +6258,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_TOKEN_CREATE: err = token_create(&attr); break; + case BPF_PROG_STREAM_READ_BY_FD: + err = prog_stream_read(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a..9cbe15ce3540 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[]; +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + phys_addr_t addr = __pa_symbol(__start_BTF); + unsigned long pfn = addr >> 
PAGE_SHIFT; + + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (pfn + pages < pfn) + return -EINVAL; + + if ((vm_size >> PAGE_SHIFT) > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, - .read_new = sysfs_bin_attr_simple_read, + .read = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, }; struct kobject *btf_kobj; diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 2e4885e7781f..efd987ea6872 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); - bool created, ingress = tcx->location == BPF_TCX_INGRESS; + bool created, ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; @@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog *oprog) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", - tcx->location, - tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); + link->attach_type, + link->attach_type == BPF_TCX_INGRESS ? 
"ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, @@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link, rtnl_unlock(); info->tcx.ifindex = ifindex; - info->tcx.attach_type = tcx->location; + info->tcx.attach_type = link->attach_type; return 0; } @@ -301,8 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx, struct net_device *dev, struct bpf_prog *prog) { - bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); - tcx->location = attr->link_create.attach_type; + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, + attr->link_create.attach_type); tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 9dbc31b25e3d..f8e70e9c3998 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b) return TNUM(dv & ~mu, mu); } +struct tnum tnum_neg(struct tnum a) +{ + return tnum_sub(TNUM(0, 0), a); +} + struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; @@ -111,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* Generate partial products by multiplying each bit in the multiplier (tnum a) - * with the multiplicand (tnum b), and add the partial products after - * appropriately bit-shifting them. Instead of directly performing tnum addition - * on the generated partial products, equivalenty, decompose each partial - * product into two tnums, consisting of the value-sum (acc_v) and the - * mask-sum (acc_m) and then perform tnum addition on them. The following paper - * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. +/* Perform long multiplication, iterating through the bits in a using rshift: + * - if LSB(a) is a known 0, keep current accumulator + * - if LSB(a) is a known 1, add b to current accumulator + * - if LSB(a) is unknown, take a union of the above cases. + * + * For example: + * + * acc_0: acc_1: + * + * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 + * x1 01 11 + * ------ ------ ------ + * 11 11 11 + * xx 00 11 + * ------ ------ ------ + * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { - u64 acc_v = a.value * b.value; - struct tnum acc_m = TNUM(0, 0); + struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ - else if (a.mask & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + else if (a.mask & 1) { + /* acc = tnum_union(acc_0, acc_1), where acc_0 and + * acc_1 are partial accumulators for cases + * LSB(a) = certain 0 and LSB(a) = certain 1. + * acc_0 = acc + 0 * b = acc. + * acc_1 = acc + 1 * b = tnum_add(acc, b). + */ + + acc = tnum_union(acc, tnum_add(acc, b)); + } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } - return tnum_add(TNUM(acc_v, 0), acc_m); + return acc; +} + +bool tnum_overlap(struct tnum a, struct tnum b) +{ + u64 mu; + + mu = ~a.mask & ~b.mask; + return (a.value & mu) == (b.value & mu); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has @@ -150,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } +/* Returns a tnum with the uncertainty from both a and b, and in addition, new + * uncertainty at any position that a and b disagree. 
This represents a + * superset of the union of the concrete sets of both a and b. Despite the + * overapproximation, it is optimal. + */ +struct tnum tnum_union(struct tnum a, struct tnum b) +{ + u64 v = a.value & b.value; + u64 mu = (a.value ^ b.value) | a.mask | b.mask; + + return TNUM(v & ~mu, mu); +} + struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 26057aa13503..feecd8f4dbf9 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -103,23 +103,22 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) static const struct inode_operations bpf_token_iops = { }; -static const struct file_operations bpf_token_fops = { +const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { + struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; - struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; - int err, fd; + int err; if (fd_empty(f)) return -EBADF; @@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr) inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - return PTR_ERR(file); - } + FD_PREPARE(fdf, O_CLOEXEC, + alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, + O_RDWR, &bpf_token_fops)); + if (fdf.err) + return fdf.err; token = kzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } + if (!token) + return -ENOMEM; atomic64_set(&token->refcnt, 1); - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - + /* remember bpffs owning userns for future ns_capable() checks. 
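/*
 * Illustrative sketch (editor's addition, not part of the patch): a
 * self-contained user-space replay of the "11 * x1" example from the
 * tnum_mul() comment in kernel/bpf/tnum.c above. tnum_add() mirrors the
 * existing kernel implementation; tnum_union() and tnum_mul() mirror the
 * versions added by this series. Builds with any C compiler.
 */
#include <stdint.h>
#include <stdio.h>

struct tnum { uint64_t value; uint64_t mask; };
#define TNUM(v, m) ((struct tnum){ .value = (v), .mask = (m) })

static struct tnum tnum_add(struct tnum a, struct tnum b)
{
        uint64_t sm = a.mask + b.mask;
        uint64_t sv = a.value + b.value;
        uint64_t sigma = sm + sv;
        uint64_t chi = sigma ^ sv;
        uint64_t mu = chi | a.mask | b.mask;

        return TNUM(sv & ~mu, mu);
}

static struct tnum tnum_union(struct tnum a, struct tnum b)
{
        uint64_t v = a.value & b.value;
        uint64_t mu = (a.value ^ b.value) | a.mask | b.mask;

        return TNUM(v & ~mu, mu);
}

static struct tnum tnum_mul(struct tnum a, struct tnum b)
{
        struct tnum acc = TNUM(0, 0);

        while (a.value || a.mask) {
                if (a.value & 1)                /* LSB(a) is a known 1: add b */
                        acc = tnum_add(acc, b);
                else if (a.mask & 1)            /* LSB(a) unknown: union of both cases */
                        acc = tnum_union(acc, tnum_add(acc, b));
                a = TNUM(a.value >> 1, a.mask >> 1);
                b = TNUM(b.value << 1, b.mask << 1);
        }
        return acc;
}

int main(void)
{
        /* a = binary "x1" (value 01, mask 10), b = binary "11" */
        struct tnum r = tnum_mul(TNUM(1, 2), TNUM(3, 0));

        /* prints value=0x1 mask=0xa, i.e. "x0x1" as in the comment above */
        printf("value=%#llx mask=%#llx\n",
               (unsigned long long)r.value, (unsigned long long)r.mask);
        return 0;
}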
*/ + token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; @@ -190,24 +186,34 @@ int bpf_token_create(union bpf_attr *attr) err = security_bpf_token_create(token, attr, &path); if (err) - goto out_token; + return err; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } + get_user_ns(token->userns); + fd_prepare_file(fdf)->private_data = no_free_ptr(token); + return fd_publish(fdf); +} - file->private_data = token; - fd_install(fd, file); +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_token_info info; + u32 info_len = attr->info.info_len; - return fd; + info_len = min_t(u32, info_len, sizeof(info)); + memset(&info, 0, sizeof(info)); -out_token: - bpf_token_free(token); -out_file: - fput(file); - return err; + info.allowed_cmds = token->allowed_cmds; + info.allowed_maps = token->allowed_maps; + info.allowed_progs = token->allowed_progs; + info.allowed_attachs = token->allowed_attachs; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c4b1a98ff726..976d89011b15 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -175,23 +175,42 @@ out: return tr; } -static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr) { + enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL; void *ip = tr->func.addr; + + if (!new_addr) + new_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(tr->flags)) + new_t = BPF_MOD_JUMP; + + if (!old_addr) + old_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(orig_flags)) + old_t = BPF_MOD_JUMP; + + return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); +} + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr) +{ int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr, bool lock_direct_mutex) { - void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { @@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, + new_addr); } return ret; } @@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + if (ret) + return ret; ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + ret = 
bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } return ret; @@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { - int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, - NULL, im->ip_epilogue); + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); @@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, tr->cur_image->image); + err = unregister_fentry(tr, orig_flags, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: - if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && - (tr->flags & BPF_TRAMP_F_CALL_ORIG)) - tr->flags |= BPF_TRAMP_F_ORIG_STACK; + if (tr->flags & BPF_TRAMP_F_CALL_ORIG) { + if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) { + /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the + * first try, reset it in the second try. + */ + tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME; + } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) { + /* Use "jmp" instead of "call" for the trampoline + * in the origin call case, and we don't need to + * skip the frame. + */ + tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME; + } + } #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, @@ -465,10 +499,18 @@ again: if (err) goto out_free; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); + err = modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); @@ -479,11 +521,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. 
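/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * transitions implied by the two-poke-type bpf_arch_text_poke() API used
 * above, with the type selection from bpf_trampoline_update_fentry() folded
 * into one hypothetical helper. poke_type_for() does not exist in the tree.
 *
 *   register_fentry:    NOP -> CALL (or NOP -> JMP when the trampoline uses jmp)
 *   modify_fentry:      old image's type -> new image's type (CALL or JMP each side)
 *   unregister_fentry:  CALL (or JMP) -> NOP
 */
static enum bpf_text_poke_type poke_type_for(u32 flags, void *addr)
{
        if (!addr)
                return BPF_MOD_NOP;
        return bpf_trampoline_use_jmp(flags) ? BPF_MOD_JUMP : BPF_MOD_CALL;
}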
*/ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } @@ -496,8 +533,15 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) + if (err) { tr->flags = orig_flags; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + } kfree(tlinks); return err; @@ -573,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, if (err) return err; tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) @@ -621,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + BPF_MOD_NOP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); @@ -674,7 +720,8 @@ static const struct bpf_link_ops bpf_shim_tramp_link_lops = { static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; @@ -701,7 +748,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p); + &bpf_shim_tramp_link_lops, p, attach_type); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -726,7 +773,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; @@ -763,7 +811,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, /* Allocate and install new shim. */ - shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type); if (!shim_link) { err = -ENOMEM; goto err; @@ -897,8 +945,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -911,27 +958,32 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram return bpf_prog_start_time(); } -static void notrace update_prog_stats(struct bpf_prog *prog, - u64 start) +static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; + unsigned long flags; + u64 duration; - if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter* - * and disabled in __bpf_prog_exit*. - * And vice versa. - * Hence check that 'start' is valid. 
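/*
 * Illustrative sketch (editor's addition, not part of the patch) of the
 * pattern used by update_prog_stats() above: keep only the static-branch test
 * in an __always_inline wrapper on the hot path and push the cold body out of
 * line, so the call site costs a single patched nop/jump while stats are off.
 * The names below (demo_key, demo_slowpath, demo_hook) are made up.
 */
DEFINE_STATIC_KEY_FALSE(demo_key);

static noinline void demo_slowpath(void)
{
        /* rarely-taken bookkeeping lives out of line */
}

static __always_inline void demo_hook(void)
{
        if (static_branch_unlikely(&demo_key))
                demo_slowpath();
}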
- */ - start > NO_START_TIME) { - u64 duration = sched_clock() - start; - unsigned long flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); - } + /* + * static_key could be enabled in __bpf_prog_enter* and disabled in + * __bpf_prog_exit*. And vice versa. Check that 'start' is valid. + */ + if (start <= NO_START_TIME) + return; + + duration = sched_clock() - start; + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + +static __always_inline void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) +{ + if (static_branch_unlikely(&bpf_stats_enabled_key)) + __update_prog_stats(prog, start); } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, @@ -942,8 +994,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, @@ -953,8 +1004,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. */ - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -967,8 +1017,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, { bpf_reset_run_ctx(run_ctx->saved_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, @@ -1026,8 +1075,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -1041,8 +1089,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 54c6953a8b84..f0ca69f888fa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -44,6 +44,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +enum bpf_features { + BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, + __MAX_BPF_FEAT, +}; + struct bpf_mem_alloc bpf_global_percpu_ma; static bool bpf_global_percpu_ma_set; @@ -203,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr); static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) @@ -322,6 +326,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *arg_btf; u32 arg_btf_id; bool arg_owning_ref; + 
bool arg_prog; struct { struct btf_field *field; @@ -404,7 +409,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM; + (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || + type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) @@ -507,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { @@ -539,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn) (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } +static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + /* bpf_timer callbacks are never sleepable. */ + if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) + return false; + + /* bpf_wq and bpf_task_work callbacks are always sleepable. */ + if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + return true; + + verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); + return false; +} + static bool is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; @@ -666,6 +688,10 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -682,6 +708,10 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -689,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, @@ -775,8 +805,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -793,29 +822,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. 
Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. - */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -823,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, ref_obj_id, i; + /* + * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) { + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); + return -EFAULT; + } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -854,7 +870,7 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re * dynptr */ if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT; } if (state->stack[i].spilled_ptr.dynptr.first_slot) @@ -924,9 +940,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -1044,7 +1058,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? 
id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1054,6 +1067,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_ITER; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1079,12 +1093,10 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; - for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1174,9 +1186,9 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, slot = &state->stack[spi]; st = &slot->spilled_ptr; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; @@ -1230,8 +1242,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; @@ -1402,7 +1413,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); - new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; @@ -1419,14 +1430,14 @@ out: static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, - sizeof(struct bpf_reference_state), GFP_KERNEL); + sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; - dst->active_rcu_lock = src->active_rcu_lock; + dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; @@ -1438,7 +1449,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; @@ -1646,7 +1657,7 @@ static void update_peak_states(struct bpf_verifier_env *env) { u32 cur_states; - cur_states = env->explored_states_size + env->free_list_size; + cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; env->peak_states = max(env->peak_states, cur_states); } @@ -1658,6 +1669,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1668,11 +1686,12 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } 
kfree(state->refs); + clear_jmp_history(state); if (free_self) kfree(state); } -/* struct bpf_verifier_state->{parent,loop_entry} refer to states +/* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list. */ @@ -1683,37 +1702,24 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_ return NULL; } -static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) -{ - if (st->loop_entry) - return container_of(st->loop_entry, struct bpf_verifier_state_list, state); - return NULL; -} +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; - * - is not used as loop_entry. - * - * Freeing a state can make it's loop_entry free-able. */ static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) { - struct bpf_verifier_state_list *loop_entry_sl; - - while (sl && sl->in_free_list && - sl->state.branches == 0 && - sl->state.used_as_loop_entry == 0) { - loop_entry_sl = state_loop_entry_as_list(&sl->state); - if (loop_entry_sl) - loop_entry_sl->state.used_as_loop_entry--; - list_del(&sl->node); - free_verifier_state(&sl->state, false); - kfree(sl); - env->free_list_size--; - sl = loop_entry_sl; - } + if (!sl->in_free_list + || sl->state.branches != 0 + || incomplete_read_marks(env, &sl->state)) + return; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; } /* copy verifier state from src to dst growing dst stack space @@ -1732,6 +1738,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_KERNEL_ACCOUNT); + if (!dst_state->jmp_history) + return -ENOMEM; + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. 
*/ @@ -1744,22 +1757,20 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; + dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; - dst_state->insn_hist_start = src->insn_hist_start; - dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; - dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; - dst_state->loop_entry = src->loop_entry; + dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; @@ -1798,184 +1809,256 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Open coded iterators allow back-edges in the state graph in order to - * check unbounded loops that iterators. - * - * In is_state_visited() it is necessary to know if explored states are - * part of some loops in order to decide whether non-exact states - * comparison could be used: - * - non-exact states comparison establishes sub-state relation and uses - * read and precision marks to do so, these marks are propagated from - * children states and thus are not guaranteed to be final in a loop; - * - exact states comparison just checks if current and explored states - * are identical (and thus form a back-edge). - * - * Paper "A New Algorithm for Identifying Loops in Decompilation" - * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient - * algorithm for loop structure detection and gives an overview of - * relevant terminology. It also has helpful illustrations. - * - * [1] https://api.semanticscholar.org/CorpusID:15784067 - * - * We use a similar algorithm but because loop nested structure is - * irrelevant for verifier ours is significantly simpler and resembles - * strongly connected components algorithm from Sedgewick's textbook. - * - * Define topmost loop entry as a first node of the loop traversed in a - * depth first search starting from initial state. The goal of the loop - * tracking algorithm is to associate topmost loop entries with states - * derived from these entries. - * - * For each step in the DFS states traversal algorithm needs to identify - * the following situations: - * - * initial initial initial - * | | | - * V V V - * ... ... .---------> hdr - * | | | | - * V V | V - * cur .-> succ | .------... - * | | | | | | - * V | V | V V - * succ '-- cur | ... ... - * | | | - * | V V - * | succ <- cur - * | | - * | V - * | ... - * | | - * '----' - * - * (A) successor state of cur (B) successor state of cur or it's entry - * not yet traversed are in current DFS path, thus cur and succ - * are members of the same outermost loop - * - * initial initial - * | | - * V V - * ... ... - * | | - * V V - * .------... .------... - * | | | | - * V V V V - * .-> hdr ... ... ... - * | | | | | - * | V V V V - * | succ <- cur succ <- cur - * | | | - * | V V - * | ... ... 
- * | | | - * '----' exit - * - * (C) successor state of cur is a part of some loop but this loop - * does not include cur or successor state is not in a loop at all. - * - * Algorithm could be described as the following python code: - * - * traversed = set() # Set of traversed nodes - * entries = {} # Mapping from node to loop entry - * depths = {} # Depth level assigned to graph node - * path = set() # Current DFS path - * - * # Find outermost loop entry known for n - * def get_loop_entry(n): - * h = entries.get(n, None) - * while h in entries: - * h = entries[h] - * return h - * - * # Update n's loop entry if h comes before n in current DFS path. - * def update_loop_entry(n, h): - * if h in path and depths[entries.get(n, n)] < depths[n]: - * entries[n] = h1 +/* Return IP for a given frame in a call stack */ +static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) +{ + return frame == st->curframe + ? st->insn_idx + : st->frame[frame + 1]->callsite; +} + +/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, + * if such frame exists form a corresponding @callchain as an array of + * call sites leading to this frame and SCC id. + * E.g.: * - * def dfs(n, depth): - * traversed.add(n) - * path.add(n) - * depths[n] = depth - * for succ in G.successors(n): - * if succ not in traversed: - * # Case A: explore succ and update cur's loop entry - * # only if succ's entry is in current DFS path. - * dfs(succ, depth + 1) - * h = entries.get(succ, None) - * update_loop_entry(n, h) - * else: - * # Case B or C depending on `h1 in path` check in update_loop_entry(). - * update_loop_entry(n, succ) - * path.remove(n) + * void foo() { A: loop {... SCC#1 ...}; } + * void bar() { B: loop { C: foo(); ... SCC#2 ... } + * D: loop { E: foo(); ... SCC#3 ... } } + * void main() { F: bar(); } * - * To adapt this algorithm for use with verifier: - * - use st->branch == 0 as a signal that DFS of succ had been finished - * and cur's loop entry has to be updated (case A), handle this in - * update_branch_counts(); - * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(). + * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending + * on @st frame call sites being (F,C,A) or (F,E,A). */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) +static bool compute_scc_callchain(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_callchain *callchain) { - struct bpf_verifier_state *topmost = st->loop_entry; - u32 steps = 0; + u32 i, scc, insn_idx; - while (topmost && topmost->loop_entry) { - if (steps++ > st->dfs_depth) { - WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n"); - verbose(env, "verifier bug: infinite loop in get_loop_entry()\n"); - return ERR_PTR(-EFAULT); + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= st->curframe; i++) { + insn_idx = frame_insn_idx(st, i); + scc = env->insn_aux_data[insn_idx].scc; + if (scc) { + callchain->scc = scc; + break; + } else if (i < st->curframe) { + callchain->callsites[i] = insn_idx; + } else { + return false; } - topmost = topmost->loop_entry; } - return topmost; + return true; } -static void update_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +/* Check if bpf_scc_visit instance for @callchain exists. 
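[Illustrative aside, not part of the patch: compute_scc_callchain() above zero-fills the whole bpf_scc_callchain before populating the call sites and SCC id, and scc_visit_lookup() later compares keys with memcmp(). The user-space sketch below, with an invented and smaller key type, shows why that memset matters: unused slots must match byte-for-byte for memcmp()-based lookup to work.]

#include <stdio.h>
#include <string.h>

struct callchain_key {
	unsigned int callsites[8];	/* call sites of the outer frames (illustrative size) */
	unsigned int scc;		/* SCC id of the topmost in-SCC frame */
};

static int same_callchain(const struct callchain_key *a, const struct callchain_key *b)
{
	return memcmp(a, b, sizeof(*a)) == 0;	/* only valid if unused slots are zeroed */
}

int main(void)
{
	struct callchain_key a, b;

	memset(&a, 0, sizeof(a));
	memset(&b, 0, sizeof(b));
	a.callsites[0] = 42; a.scc = 3;
	b.callsites[0] = 42; b.scc = 3;
	printf("equal: %d\n", same_callchain(&a, &b));	/* prints 1 */
	return 0;
}

[End of aside.]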
*/ +static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) { - /* The hdr->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr->branches == 0 then - * head's topmost loop entry is not in current DFS path, - * hence 'cur' and 'hdr' are not in the same loop and there is - * no need to update cur->loop_entry. - */ - if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { - if (cur->loop_entry) { - cur->loop_entry->used_as_loop_entry--; - maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); + struct bpf_scc_info *info = env->scc_info[callchain->scc]; + struct bpf_scc_visit *visits = info->visits; + u32 i; + + if (!info) + return NULL; + for (i = 0; i < info->num_visits; i++) + if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) + return &visits[i]; + return NULL; +} + +/* Allocate a new bpf_scc_visit instance corresponding to @callchain. + * Allocated instances are alive for a duration of the do_check_common() + * call and are freed by free_states(). + */ +static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_visit *visit; + struct bpf_scc_info *info; + u32 scc, num_visits; + u64 new_sz; + + scc = callchain->scc; + info = env->scc_info[scc]; + num_visits = info ? info->num_visits : 0; + new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); + if (!info) + return NULL; + env->scc_info[scc] = info; + info->num_visits = num_visits + 1; + visit = &info->visits[num_visits]; + memset(visit, 0, sizeof(*visit)); + memcpy(&visit->callchain, callchain, sizeof(*callchain)); + return visit; +} + +/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ +static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) +{ + char *buf = env->tmp_str_buf; + int i, delta = 0; + + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); + for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { + if (!callchain->callsites[i]) + break; + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", + callchain->callsites[i]); + } + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); + return env->tmp_str_buf; +} + +/* If callchain for @st exists (@st is in some SCC), ensure that + * bpf_scc_visit instance for this callchain exists. + * If instance does not exist or is empty, assign visit->entry_state to @st. + */ +static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + visit = visit ?: scc_visit_alloc(env, callchain); + if (!visit) + return -ENOMEM; + if (!visit->entry_state) { + visit->entry_state = st; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); + } + return 0; +} + +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); + +/* If callchain for @st exists (@st is in some SCC), make it empty: + * - set visit->entry_state to NULL; + * - flush accumulated backedges. 
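[Illustrative aside, not part of the patch: scc_visit_alloc() above grows a header with a trailing array by one element via kvrealloc() and initializes only the new slot; the previously stored pointer is replaced only after the reallocation succeeds. A user-space analogue with invented types:]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct visit {
	unsigned int key;
};

struct visit_info {
	unsigned int num_visits;
	struct visit visits[];		/* flexible array member */
};

/* Append one zero-initialized element, growing the allocation by one slot. */
static struct visit *visit_alloc(struct visit_info **infop, unsigned int key)
{
	unsigned int n = *infop ? (*infop)->num_visits : 0;
	size_t new_sz = sizeof(struct visit_info) + sizeof(struct visit) * (n + 1);
	struct visit_info *info = realloc(*infop, new_sz);

	if (!info)
		return NULL;		/* *infop and its contents stay valid */
	*infop = info;
	info->num_visits = n + 1;
	memset(&info->visits[n], 0, sizeof(info->visits[n]));
	info->visits[n].key = key;
	return &info->visits[n];
}

int main(void)
{
	struct visit_info *info = NULL;

	if (!visit_alloc(&info, 7) || !visit_alloc(&info, 9))
		return 1;
	printf("%u visits, last key %u\n", info->num_visits, info->visits[1].key);
	free(info);
	return 0;
}

[End of aside.]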
+ */ +static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + if (!visit) { + /* + * If path traversal stops inside an SCC, corresponding bpf_scc_visit + * must exist for non-speculative paths. For non-speculative paths + * traversal stops when: + * a. Verification error is found, maybe_exit_scc() is not called. + * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member + * of any SCC. + * c. A checkpoint is reached and matched. Checkpoints are created by + * is_state_visited(), which calls maybe_enter_scc(), which allocates + * bpf_scc_visit instances for checkpoints within SCCs. + * (c) is the only case that can reach this point. + */ + if (!st->speculative) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; } - cur->loop_entry = hdr; - hdr->used_as_loop_entry++; + return 0; + } + if (visit->entry_state != st) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); + visit->entry_state = NULL; + env->num_backedges -= visit->num_backedges; + visit->num_backedges = 0; + update_peak_states(env); + return propagate_backedges(env, visit); +} + +/* Lookup an bpf_scc_visit instance corresponding to @st callchain + * and add @backedge to visit->backedges. @st callchain must exist. + */ +static int add_scc_backedge(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_backedge *backedge) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) { + verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", + st->insn_idx); + return -EFAULT; } + visit = scc_visit_lookup(env, callchain); + if (!visit) { + verifier_bug(env, "add backedge: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); + backedge->next = visit->backedges; + visit->backedges = backedge; + visit->num_backedges++; + env->num_backedges++; + update_peak_states(env); + return 0; } -static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +/* bpf_reg_state->live marks for registers in a state @st are incomplete, + * if state @st is in some SCC and not all execution paths starting at this + * SCC are fully explored. 
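[Illustrative aside, not part of the patch: add_scc_backedge() above pushes each recorded backedge onto the front of a per-visit singly linked list, and the list is later propagated and freed once the SCC visit ends. A compact user-space sketch of that accumulate-then-flush pattern, with invented names and an int standing in for the saved verifier state:]

#include <stdio.h>
#include <stdlib.h>

struct backedge {
	int insn_idx;			/* stand-in for the saved state */
	struct backedge *next;
};

struct visit {
	struct backedge *backedges;	/* most recently added edge first */
	unsigned int num_backedges;
};

static int add_backedge(struct visit *v, int insn_idx)
{
	struct backedge *be = calloc(1, sizeof(*be));

	if (!be)
		return -1;
	be->insn_idx = insn_idx;
	be->next = v->backedges;	/* O(1) push on the front */
	v->backedges = be;
	v->num_backedges++;
	return 0;
}

static void flush_backedges(struct visit *v)
{
	struct backedge *be, *next;

	for (be = v->backedges; be; be = next) {
		next = be->next;	/* read the link before freeing */
		printf("processing backedge at insn %d\n", be->insn_idx);
		free(be);
	}
	v->backedges = NULL;
	v->num_backedges = 0;
}

int main(void)
{
	struct visit v = { 0 };

	add_backedge(&v, 10);
	add_backedge(&v, 42);
	flush_backedges(&v);	/* prints 42 then 10 */
	return 0;
}

[End of aside.]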
+ */ +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return false; + visit = scc_visit_lookup(env, callchain); + if (!visit) + return false; + return !!visit->backedges; +} + +static void free_backedges(struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge, *next; + + for (backedge = visit->backedges; backedge; backedge = next) { + free_verifier_state(&backedge->state, false); + next = backedge->next; + kfree(backedge); + } + visit->backedges = NULL; +} + +static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; + int err; while (st) { u32 br = --st->branches; - /* br == 0 signals that DFS exploration for 'st' is finished, - * thus it is necessary to update parent's loop entry if it - * turned out that st is a part of some loop. - * This is a part of 'case A' in get_loop_entry() comment. - */ - if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(env, st->parent, st->loop_entry); - - /* WARN_ON(br > 1) technically makes sense here, + /* verifier_bug_if(br > 1, ...) technically makes sense here, * but see comment in push_stack(), hence: */ - WARN_ONCE((int)br < 0, - "BUG update_branch_counts:branches_to_explore=%d\n", - br); + verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break; + err = maybe_exit_scc(env, st); + if (err) + return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) @@ -1983,6 +2066,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi st = parent; sl = parent_sl; } + return 0; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, @@ -2014,6 +2098,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return 0; } +static bool error_recoverable_with_nospec(int err) +{ + /* Should only return true for non-fatal errors that are allowed to + * occur during speculative verification. For these we can insert a + * nospec and the program might still be accepted. Do not include + * something like ENOMEM because it is likely to re-occur for the next + * architectural path once it has been recovered-from in all speculative + * paths. 
+ */ + return err == -EPERM || err == -EACCES || err == -EINVAL; +} + static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) @@ -2022,9 +2118,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; int err; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2034,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - goto err; + return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - goto err; + return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2054,12 +2150,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, */ } return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } #define CALLER_SAVED_REGS 6 @@ -2161,10 +2251,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -2203,7 +2293,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ @@ -2452,6 +2543,58 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) if ((u64)reg->smin_value <= (u64)reg->smax_value) { reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + } else { + /* If the s64 range crosses the sign boundary, then it's split + * between the beginning and end of the U64 domain. In that + * case, we can derive new bounds if the u64 range overlaps + * with only one end of the s64 range. + * + * In the following example, the u64 range overlaps only with + * positive portion of the s64 range. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * We can thus derive the following new s64 and u64 ranges. 
+ * + * 0 U64_MAX + * | [xxxxxx u64 range xxxxx] | + * |----------------------------|----------------------------| + * | [xxxxxx s64 range xxxxx] | + * 0 S64_MAX S64_MIN -1 + * + * If they overlap in two places, we can't derive anything + * because reg_state can't represent two ranges per numeric + * domain. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * The first condition below corresponds to the first diagram + * above. + */ + if (reg->umax_value < (u64)reg->smin_value) { + reg->smin_value = (s64)reg->umin_value; + reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); + } else if ((u64)reg->smax_value < reg->umin_value) { + /* This second condition considers the case where the u64 range + * overlaps with the negative portion of the s64 range: + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | + * 0 S64_MAX S64_MIN -1 + */ + reg->smax_value = (s64)reg->umax_value; + reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + } } } @@ -2483,20 +2626,6 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); - /* if s32 can be treated as valid u32 range, we can use it as well */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - /* s32 -> u64 tightening */ - new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* s32 -> s64 tightening */ - new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - } - /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. * @@ -2563,6 +2692,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. 
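[Illustrative aside, not part of the patch: the new branch in __reg64_deduce_bounds() above tightens both domains when the s64 range wraps around the sign boundary but the u64 range touches only one of its two pieces. A standalone worked example with made-up bounds (s64 in [-5, 10], u64 in [3, 100], which tightens to [3, 10] in both domains):]

#include <stdio.h>
#include <stdint.h>

struct bounds {
	uint64_t umin, umax;
	int64_t  smin, smax;
};

static void deduce_crossing(struct bounds *b)
{
	if ((uint64_t)b->smin <= (uint64_t)b->smax)
		return;				/* s64 range does not cross the sign boundary */
	if (b->umax < (uint64_t)b->smin) {	/* u64 range sits in the non-negative piece */
		b->smin = (int64_t)b->umin;
		b->umax = b->umax < (uint64_t)b->smax ? b->umax : (uint64_t)b->smax;
	} else if ((uint64_t)b->smax < b->umin) { /* u64 range sits in the negative piece */
		b->smax = (int64_t)b->umax;
		b->umin = b->umin > (uint64_t)b->smin ? b->umin : (uint64_t)b->smin;
	}
}

int main(void)
{
	struct bounds b = { .umin = 3, .umax = 100, .smin = -5, .smax = 10 };

	deduce_crossing(&b);
	/* prints u64=[3,10] s64=[3,10] */
	printf("u64=[%llu,%llu] s64=[%lld,%lld]\n",
	       (unsigned long long)b.umin, (unsigned long long)b.umax,
	       (long long)b.smin, (long long)b.smax);
	return 0;
}

Note the deduction only fires when the u64 range overlaps exactly one of the two pieces; if it overlaps both, the bounds are left untouched, since a single [min, max] pair cannot represent two disjoint ranges. End of aside.]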
*/ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds @@ -2609,13 +2739,13 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, - reg->var_off.value, reg->var_off.mask); + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); @@ -2725,22 +2855,33 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } -static void mark_btf_ld_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id, - enum bpf_type_flag flag) +static int mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { - if (reg_type == SCALAR_VALUE) { + switch (reg_type) { + case SCALAR_VALUE: mark_reg_unknown(env, regs, regno); - return; + return 0; + case PTR_TO_BTF_ID: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID | flag; + regs[regno].btf = btf; + regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; + return 0; + case PTR_TO_MEM: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_MEM | flag; + regs[regno].mem_size = 0; + return 0; + default: + verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); + return -EFAULT; } - mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID | flag; - regs[regno].btf = btf; - regs[regno].btf_id = btf_id; - if (type_may_be_null(flag)) - regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -2752,8 +2893,6 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -2789,9 +2928,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2803,35 +2942,24 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - goto err; + return ERR_PTR(-E2BIG); } /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). 
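[Illustrative aside, not part of the patch: push_stack() and push_async_cb() above now report failures as ERR_PTR(-ENOMEM) or ERR_PTR(-E2BIG) instead of returning NULL and unwinding the whole stack themselves, so callers can tell the failure reasons apart. The user-space sketch below re-implements the <linux/err.h> helpers purely for illustration (the patch does not redefine them):]

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)    { return (void *)error; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;	/* top 4095 values encode errnos */
}

static void *push_elem(int fail)
{
	static int elem;	/* placeholder object handed back on success */

	if (fail)
		return ERR_PTR(-ENOMEM);	/* report the reason, not just NULL */
	return &elem;
}

int main(void)
{
	void *p = push_elem(1);

	if (IS_ERR(p)) {
		printf("push failed: %ld\n", PTR_ERR(p));	/* -12 (ENOMEM) on Linux */
		return 1;
	}
	return 0;
}

Encoding the errno in the pointer keeps the success path a plain pointer return while still propagating a precise error code. End of aside.]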
- * But we do need to make sure to not clobber insn_hist, so we keep - * chaining insn_hist_start/insn_hist_end indices as for a normal - * child state. */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - elem->st.insn_hist_start = env->cur_state->insn_hist_end; - elem->st.insn_hist_end = elem->st.insn_hist_start; - frame = kzalloc(sizeof(*frame), GFP_KERNEL); + frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - goto err; + return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, subprog /* subprog number within this prog */); elem->st.frame[0] = frame; return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } @@ -2848,7 +2976,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -2873,7 +3001,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -2996,6 +3124,9 @@ struct bpf_kfunc_btf_tab { u32 nr_descs; }; +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, + int insn_idx); + static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; @@ -3013,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b) return d0->offset - d1->offset; } -static const struct bpf_kfunc_desc * +static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { @@ -3136,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) { const struct btf_type *func, *func_proto; struct bpf_kfunc_btf_tab *btf_tab; + struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; - unsigned long call_imm; unsigned long addr; int err; @@ -3169,7 +3300,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } - tab = kzalloc(sizeof(*tab), GFP_KERNEL); + tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; @@ -3185,7 +3316,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; if (!btf_tab && offset) { - btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL); + btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; @@ -3225,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) func_name); return -EINVAL; } - specialize_kfunc(env, func_id, offset, &addr); - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = func_id; - } else { - call_imm = BPF_CALL_IMM(addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - 
func_name); - return -EINVAL; - } - } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); @@ -3245,18 +3363,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } + err = btf_distill_func_proto(&env->log, desc_btf, + func_proto, func_name, + &func_model); + if (err) + return err; + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = call_imm; desc->offset = offset; desc->addr = addr; - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &desc->func_model); - if (!err) - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_id_off, NULL); - return err; + desc->func_model = func_model; + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id_off, NULL); + return 0; } static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) @@ -3271,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) return 0; } -static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) +static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +{ + unsigned long call_imm; + + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = desc->func_id; + } else { + call_imm = BPF_CALL_IMM(desc->addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel func_id %u is out of range\n", + desc->func_id); + return -EINVAL; + } + } + desc->imm = call_imm; + return 0; +} + +static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) { struct bpf_kfunc_desc_tab *tab; + int i, err; - tab = prog->aux->kfunc_tab; + tab = env->prog->aux->kfunc_tab; if (!tab) - return; + return 0; + + for (i = 0; i < tab->nr_descs; i++) { + err = set_kfunc_desc_imm(env, &tab->descs[i]); + if (err) + return err; + } sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off, NULL); + return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -3384,15 +3531,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } -static int jmp_offset(struct bpf_insn *insn) -{ - u8 code = insn->code; - - if (code == (BPF_JMP32 | BPF_JA)) - return insn->imm; - return insn->off; -} - static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3417,9 +3555,13 @@ static int check_subprogs(struct bpf_verifier_env *env) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + if (BPF_OP(code) == BPF_CALL) + goto next; + if (BPF_OP(code) == BPF_EXIT) { + subprog[cur_subprog].exit_idx = i; goto next; - off = i + jmp_offset(&insn[i]) + 1; + } + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3445,70 +3587,15 @@ next: return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. - */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) -{ - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; - - while (parent) { - /* if read wasn't screened by an earlier write ... 
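[Illustrative aside, not part of the patch: set_kfunc_desc_imm() above verifies that BPF_CALL_IMM(desc->addr) still fits the signed 32-bit imm field by truncating to s32 and widening back, rejecting the address if the round trip changes the value. A standalone check of the same idiom, assuming a 64-bit unsigned long and invented names:]

#include <stdio.h>
#include <stdint.h>

static int fits_in_s32(unsigned long v)
{
	return (unsigned long)(int32_t)v == v;	/* round-trips only if representable as s32 */
}

int main(void)
{
	printf("%d\n", fits_in_s32(0x7fffffffUL));	 /* 1: largest positive s32 */
	printf("%d\n", fits_in_s32(0xffffffffUL));	 /* 0 on 64-bit: loses the high zeros */
	printf("%d\n", fits_in_s32((unsigned long)-16)); /* 1: small negatives sign-extend back */
	return 0;
}

[End of aside.]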
*/ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - if (parent->live & REG_LIVE_DONE) { - verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off); - return -EFAULT; - } - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; -} - static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); int err, i; for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); if (err) return err; - mark_stack_slot_scratched(env, spi - i); } return 0; @@ -3554,7 +3641,7 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. */ -static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, +static bool is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) { u8 code, class, op; @@ -3649,30 +3736,30 @@ static int insn_def_regno(const struct bpf_insn *insn) case BPF_ST: return -1; case BPF_STX: - if ((BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && - (insn->imm & BPF_FETCH)) { + if (BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; - else + else if (insn->imm == BPF_LOAD_ACQ) + return insn->dst_reg; + else if (insn->imm & BPF_FETCH) return insn->src_reg; - } else { - return -1; } + return -1; default: return insn->dst_reg; } } /* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/ -static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +static bool insn_has_def32(struct bpf_insn *insn) { int dst_reg = insn_def_regno(insn); if (dst_reg == -1) return false; - return !is_reg64(env, insn, dst_reg, NULL, DST_OP); + return !is_reg64(insn, dst_reg, NULL, DST_OP); } static void mark_insn_zext(struct bpf_verifier_env *env, @@ -3703,7 +3790,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r mark_reg_scratched(env, regno); reg = ®s[regno]; - rw64 = is_reg64(env, insn, regno, reg, t); + rw64 = is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -3717,15 +3804,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -3846,10 +3931,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - struct bpf_insn_hist_entry *p; + u32 cnt = cur->jmp_history_cnt; + struct bpf_jmp_history_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3857,44 +3943,41 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot */ - WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", - env->insn_idx, env->cur_hist_ent->flags, insn_flags); + verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + env, "insn history: insn_idx %d cur flags %x new flags %x", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; - WARN_ONCE(env->cur_hist_ent->linked_regs != 0, - "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", - env->insn_idx, env->cur_hist_ent->linked_regs); + verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, + "insn history: insn_idx %d linked_regs: %#llx", + env->insn_idx, env->cur_hist_ent->linked_regs); env->cur_hist_ent->linked_regs = linked_regs; return 0; } - if (cur->insn_hist_end + 1 > env->insn_hist_cap) { - alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); - p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - env->insn_hist = p; - env->insn_hist_cap = alloc_size / sizeof(*p); - } + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); + if (!p) + return -ENOMEM; + cur->jmp_history = p; - p = &env->insn_hist[cur->insn_hist_end]; + p = &cur->jmp_history[cnt - 1]; 
p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - - cur->insn_hist_end++; + cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; return 0; } -static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, - u32 hist_start, u32 hist_end, int insn_idx) +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) { - if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) - return &env->insn_hist[hist_end - 1]; + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; return NULL; } @@ -3911,26 +3994,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env * * history entry recording a jump from last instruction of parent state and * first instruction of given state. */ -static int get_prev_insn_idx(const struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - int insn_idx, u32 hist_start, u32 *hist_endp) +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) { - u32 hist_end = *hist_endp; - u32 cnt = hist_end - hist_start; + u32 cnt = *history; - if (insn_idx == st->first_insn_idx) { + if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) + if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT; } - if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { - (*hist_endp)--; - return env->insn_hist[hist_end - 1].prev_idx; + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; } else { - return insn_idx - 1; + i--; } + return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -3987,8 +4069,7 @@ static inline u32 bt_empty(struct backtrack_state *bt) static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { - verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); return -EFAULT; } bt->frame++; @@ -3998,8 +4079,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt) static inline int bt_subprog_exit(struct backtrack_state *bt) { if (bt->frame == 0) { - verbose(bt->env, "BUG subprog exit from frame 0\n"); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog exit from frame 0"); return -EFAULT; } bt->frame--; @@ -4091,7 +4171,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; @@ -4113,7 +4193,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. 
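[Illustrative aside, not part of the patch: bpf_fmt_stack_mask() above (the former fmt_stack_mask(), now non-static) renders a bitmask of stack slots as byte offsets, e.g. "-8,-24,-40" for mask 0x15, with bit i standing for the slot at offset -(i + 1) * 8. A plain user-space rendition of the same mapping:]

#include <stdio.h>
#include <stdint.h>

static void fmt_stack_mask(char *buf, size_t buf_sz, uint64_t stack_mask)
{
	int first = 1, n = 0, i;

	buf[0] = '\0';
	for (i = 0; i < 64; i++) {
		if (!(stack_mask & (1ULL << i)))
			continue;
		if ((size_t)n >= buf_sz)
			break;		/* out of room, stop formatting */
		n += snprintf(buf + n, buf_sz - n, "%s%d",
			      first ? "" : ",", -(i + 1) * 8);
		first = 0;
	}
}

int main(void)
{
	char buf[64];

	fmt_stack_mask(buf, sizeof(buf), 0x15);	/* bits 0, 2, 4 */
	printf("%s\n", buf);			/* prints -8,-24,-40 */
	return 0;
}

[End of aside.]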
*/ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4146,8 +4226,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist } } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -4158,7 +4236,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); @@ -4174,7 +4252,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); verbose_insn(env, insn); @@ -4277,14 +4355,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * should be literally next instruction in * caller program */ - WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + verifier_bug_if(idx + 1 != subseq_idx, env, + "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore */ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "global subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* global subprog always sets R0 */ @@ -4298,16 +4377,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * the current frame should be zero by now */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "static subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug */ if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + verifier_bug(env, + "static subprog leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* propagate r1-r5 to the caller */ @@ -4330,13 +4410,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "callback unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - 
WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + verifier_bug(env, "callback leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* clear r1-r5 in callback subprog's mask */ @@ -4355,13 +4435,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* regular helper call sets R0 */ bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 + /* if backtracking was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking call unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } + if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call + && subseq_idx - idx != 1) { + if (bt_subprog_enter(bt)) + return -EFAULT; + } } else if (opcode == BPF_EXIT) { bool r0_precise; @@ -4373,12 +4458,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * backtracking, as these registers are set by the function * invoking callback. */ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking exit unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } @@ -4413,8 +4498,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * before it would be equally necessary to * propagate it to dreg. */ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before @@ -4449,7 +4536,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * . if (scalar cond K|scalar) * . helper_call(.., scalar, ...) where ARG_CONST is expected * backtrack through the verifier states and mark all registers and - * stack slots with spilled constants that these scalar regisers + * stack slots with spilled constants that these scalar registers * should be precise. * . during state pruning two registers (or spilled stack slots) * are equivalent if both are not precise. @@ -4572,7 +4659,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded insn_hist and is able to + * logic). This backtracking relies on recorded jmp_history and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. @@ -4652,23 +4739,27 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. 
*/ -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state, + int regno, + bool *changed) { + struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; - struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; - int last_idx = env->insn_idx; + int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; + bool tmp, skip_first = true; struct bpf_reg_state *reg; - bool skip_first = true; int i, fr, err; if (!env->bpf_capable) return 0; + changed = changed ?: &tmp; /* set frame number from which we are starting to backtrack */ - bt_init(bt, env->cur_state->curframe); + bt_init(bt, starting_state->curframe); /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision @@ -4678,7 +4769,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); + verifier_bug(env, "backtracking misuse"); return -EFAULT; } bt_set_reg(bt, regno); @@ -4689,9 +4780,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 hist_start = st->insn_hist_start; - u32 hist_end = st->insn_hist_end; - struct bpf_insn_hist_entry *hist; + u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4713,15 +4803,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; bt_clear_reg(bt, i); - if (reg->type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) { reg->precise = true; + *changed = true; + } } return 0; } - verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); return -EFAULT; } @@ -4730,11 +4821,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_insn_hist_entry(env, hist_start, hist_end, i); + hist = get_jmp_hist_entry(st, history, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); return 0; } else if (err) { @@ -4747,7 +4838,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); + i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4757,8 +4848,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * It means the backtracking missed the spot where * particular register was initialized with a constant. 
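[Illustrative aside, not part of the patch: __mark_chain_precision() above gains an optional 'changed' out-parameter and defaults it to a throwaway local via the GCC shorthand 'changed = changed ?: &tmp', so the body can always store through it unconditionally. A tiny user-space sketch of that idiom with invented names:]

#include <stdbool.h>
#include <stdio.h>

static int mark_precise(int regno, bool *changed)
{
	bool tmp;

	changed = changed ? changed : &tmp;	/* same effect as 'changed ?: &tmp' */
	*changed = (regno >= 0);		/* pretend a precision mark was added */
	return 0;
}

int main(void)
{
	bool c = false;

	mark_precise(3, &c);		/* caller that wants to know */
	mark_precise(3, NULL);		/* caller that does not care */
	printf("changed=%d\n", c);	/* changed=1 */
	return 0;
}

[End of aside.]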
*/ - verbose(env, "BUG backtracking idx %d\n", i); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking idx %d", i); return -EFAULT; } } @@ -4775,37 +4865,39 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt_clear_frame_reg(bt, fr, i); continue; } - if (reg->precise) + if (reg->precise) { bt_clear_frame_reg(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", - i, func->allocated_stack / BPF_REG_SIZE); - WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, + env, "stack slot %d, total slots %d", + i, func->allocated_stack / BPF_REG_SIZE)) return -EFAULT; - } if (!is_spilled_scalar_reg(&func->stack[i])) { bt_clear_frame_slot(bt, fr, i); continue; } reg = &func->stack[i].spilled_ptr; - if (reg->precise) + if (reg->precise) { bt_clear_frame_slot(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); @@ -4825,7 +4917,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * fallback to marking all precise */ if (!bt_empty(bt)) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); } @@ -4834,15 +4926,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno); + return __mark_chain_precision(env, env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to * desired reg and stack masks across all relevant frames */ -static int mark_chain_precision_batch(struct bpf_verifier_env *env) +static int mark_chain_precision_batch(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, -1); + return __mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -4927,12 +5020,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -4943,8 +5031,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -5031,13 +5117,25 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if 
(sanitize) - env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; + if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) { + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + bpf_mark_stack_write(env, state->frameno, BIT(spi)); + } + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { @@ -5081,17 +5179,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. - * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { @@ -5114,7 +5201,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5284,7 +5371,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5307,12 +5393,16 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int err; stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); + err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5327,7 +5417,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5381,7 +5470,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); @@ -5389,7 +5477,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= 
REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5401,7 +5488,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5415,13 +5501,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5796,8 +5881,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable || - (env->cur_state && env->cur_state->in_sleepable); + return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5805,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || + return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } @@ -5901,6 +5985,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); struct bpf_reg_state *val_reg; + int ret; /* Things we already checked for in check_map_access and caller: * - Reject cases where variable offset may touch kptr @@ -5934,8 +6019,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); + ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + btf_ld_kptr_type(env, kptr_field)); + if (ret < 0) + return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5954,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. 
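Editorial note: the comment ending above documents the new map_mem_size() helper, whose body follows just below in the hunk. The accessible region behind a PTR_TO_MAP_VALUE now depends on the map type: an insn-array map exposes its whole ips[] array (max_entries longs) rather than value_size. A minimal userspace sketch of that computation, under the assumption that struct toy_map and the enum values are illustrative stand-ins, not the kernel's struct bpf_map:

#include <assert.h>
#include <stddef.h>

/* Stand-in types; the real verifier works on struct bpf_map. */
enum toy_map_type { TOY_MAP_ARRAY, TOY_MAP_INSN_ARRAY };

struct toy_map {
        enum toy_map_type map_type;
        unsigned int max_entries;
        unsigned int value_size;
};

/* Mirrors the logic in the hunk: an insn-array map exposes its whole
 * ips[] array (max_entries longs); every other map exposes value_size. */
static unsigned int toy_map_mem_size(const struct toy_map *map)
{
        if (map->map_type == TOY_MAP_INSN_ARRAY)
                return map->max_entries * (unsigned int)sizeof(long);
        return map->value_size;
}

int main(void)
{
        struct toy_map insn_array = { TOY_MAP_INSN_ARRAY, 4, 8 };
        struct toy_map plain      = { TOY_MAP_ARRAY, 4, 64 };

        assert(toy_map_mem_size(&insn_array) == 4 * sizeof(long));
        assert(toy_map_mem_size(&plain) == 64);
        return 0;
}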
+ */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -5963,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -6382,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; + if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) + strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; @@ -6561,21 +6663,18 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; sidx = find_subprog(env, next_insn); - if (sidx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - next_insn); + if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; - } if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { - verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { - verbose(env, "insn %d cannot call exception cb directly\n", i); + verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } @@ -6675,11 +6774,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, int start = idx + insn->imm + 1, subprog; subprog = find_subprog(env, start); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", - start); + if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) return -EFAULT; - } return env->subprog_info[subprog].stack_depth; } #endif @@ -7004,9 +7100,16 @@ BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { + struct cgroup *cgroup; +}; + /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -7038,8 +7141,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7047,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; + struct file *vm_file; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7054,6 +7161,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } @@ -7077,7 +7185,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7087,6 +7194,8 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -7150,7 +7259,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { - verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT; } ret = env->ops->btf_struct_access(&env->log, reg, off, size); @@ -7166,7 +7275,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT; } @@ -7236,8 +7345,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (atype == BPF_READ && value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, 
value_regno, ret, reg->btf, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7291,13 +7403,19 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); - mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, + btf_vmlinux, *map->ops->map_btf_id, 0); + if (ret < 0) + return ret; ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; - if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7461,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; - /* if map is read-only, track its contents as scalars */ + /* + * If map is read-only, track its contents as scalars, + * unless it is an insn array (see the special case below) + */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && - map->ops->map_direct_value_addr) { + map->ops->map_direct_value_addr && + map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; @@ -7475,12 +7597,21 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(®s[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(®s[value_regno], reg); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } } } else if (base_type(reg->type) == PTR_TO_MEM) { bool rdonly_mem = type_is_rdonly_mem(reg->type); + bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -7500,8 +7631,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, - reg->mem_size, false); + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to check bounds in that case. 
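Editorial note: the comment above (closed just below) explains why the range check can be skipped for a read-only, untrusted PTR_TO_MEM: such pointers are only dereferenced through probe-read instructions, which tolerate faults. A small stand-alone sketch of that flag test, using illustrative flag bits rather than the kernel's enum bpf_type_flag encoding:

#include <assert.h>
#include <stdbool.h>

/* Illustrative flag bits; the kernel encodes these in enum bpf_type_flag. */
#define TOY_MEM_RDONLY    (1u << 0)
#define TOY_PTR_UNTRUSTED (1u << 1)

/* Mirrors the decision in the hunk: skip the range check only when the
 * pointer is both read-only and untrusted (accessed via probe reads). */
static bool toy_needs_bounds_check(unsigned int type_flags)
{
        bool rdonly_mem = type_flags & TOY_MEM_RDONLY;
        bool rdonly_untrusted = rdonly_mem && (type_flags & TOY_PTR_UNTRUSTED);

        return !rdonly_untrusted;
}

int main(void)
{
        assert(!toy_needs_bounds_check(TOY_MEM_RDONLY | TOY_PTR_UNTRUSTED));
        assert(toy_needs_bounds_check(TOY_MEM_RDONLY));
        assert(toy_needs_bounds_check(TOY_PTR_UNTRUSTED)); /* writable untrusted is still checked */
        return 0;
}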
+ */ + if (!rdonly_untrusted) + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { @@ -7984,7 +8120,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small\n"); + verbose(env, "allocated_stack too small\n"); return -EFAULT; } @@ -8026,10 +8162,10 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); - /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; + /* We do not call bpf_mark_stack_write(), as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. @@ -8384,38 +8520,73 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) +/* Check if @regno is a pointer to a specific field in a map value */ +static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, + enum btf_field_type field_type) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + const char *struct_name = btf_field_type_name(field_type); + int field_off = -1; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", + regno, struct_name); return -EINVAL; } if (!map->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", - map->name); + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, + struct_name); + return -EINVAL; + } + if (!btf_record_has_field(map->record, field_type)) { + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); return -EINVAL; } - if (!btf_record_has_field(map->record, BPF_TIMER)) { - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); + switch (field_type) { + case BPF_TIMER: + field_off = map->record->timer_off; + break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; + case BPF_WORKQUEUE: + field_off = map->record->wq_off; + break; + default: + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (map->record->timer_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->record->timer_off); + if (field_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", + val + reg->off, struct_name, field_off); return -EINVAL; } + return 0; +} + +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TIMER); + if (err) + return err; + if (meta->map_ptr) { - verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; @@ -8426,12 +8597,36 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; + int err; - if (map->record->wq_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", - val + reg->off, map->record->wq_off); - return -EINVAL; + err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_wq helper"); + return -EFAULT; + } + + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; } meta->map.uid = reg->map_uid; meta->map.ptr = map; @@ -8528,7 +8723,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT; } @@ -8767,7 +8962,7 @@ static int widen_imprecise_scalars(struct 
bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8780,7 +8975,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; @@ -8894,8 +9091,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { - verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", - cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + verifier_bug(env, "unexpected iterator state %d (%s)", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT; } @@ -8905,7 +9102,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, */ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || !same_callsites(cur_st->parent, cur_st)) { - verbose(env, "bug: bad parent state for iter next call"); + verifier_bug(env, "bad parent state for iter next call"); return -EFAULT; } /* Note cur_st->parent in the call below, it is necessary to skip @@ -8915,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; @@ -8964,8 +9161,8 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, { if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->type\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->type"); + return -EFAULT; } switch (meta->map_ptr->map_type) { @@ -9111,7 +9308,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { - verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT; } @@ -9193,7 +9390,7 @@ found: if (!arg_btf_id) { if (!compatible->btf_id) { - verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT; } arg_btf_id = compatible->btf_id; @@ -9225,7 +9422,7 @@ found: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { - verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + verifier_bug(env, "unimplemented handling of MEM_ALLOC"); return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ @@ -9240,7 +9437,7 @@ found: /* Handled by helper specific checks */ break; default: - verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT; } return 
0; @@ -9501,7 +9698,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, * to prevent pruning on it. */ bt_set_frame_slot(&env->bt, key->frameno, spi); - err = mark_chain_precision_batch(env); + err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; @@ -9598,7 +9795,7 @@ skip_type_check: return -EINVAL; } if (meta->release_regno) { - verbose(env, "verifier internal error: more than one release argument\n"); + verifier_bug(env, "more than one release argument"); return -EFAULT; } meta->release_regno = regno; @@ -9606,10 +9803,10 @@ skip_type_check: if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + verbose(env, "more than one arg with ref_obj_id R%d %u %u", regno, reg->ref_obj_id, meta->ref_obj_id); - return -EFAULT; + return -EACCES; } meta->ref_obj_id = reg->ref_obj_id; } @@ -9652,8 +9849,8 @@ skip_type_check: * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->key\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->key"); + return -EFAULT; } key_size = meta->map_ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); @@ -9679,8 +9876,8 @@ skip_type_check: */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->value\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->value"); + return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, @@ -9709,7 +9906,7 @@ skip_type_check: if (err) return err; } else { - verbose(env, "verifier internal error\n"); + verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT; } break; @@ -9953,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -10285,13 +10484,12 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); + verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } caller = state->frame[state->curframe]; - callee = kzalloc(sizeof(*callee), GFP_KERNEL); + callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; @@ -10346,6 +10544,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; } + } else if (arg->arg_type & PTR_UNTRUSTED) { + /* + * Anything is allowed for untrusted arguments, as these are + * read-only and probe read instructions would protect against + * invalid memory access. 
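Editorial note, stepping back to the check_map_field_pointer() helper introduced a couple of hunks above: it folds the old per-field checks for bpf_timer, bpf_wq and bpf_task_work into one routine that looks up the expected offset by btf_field_type and requires the register's constant offset to land exactly on it. A minimal userspace model of that dispatch, with toy types standing in for struct btf_record:

#include <assert.h>
#include <stdbool.h>

/* Stand-ins for the kernel's btf_field_type and btf_record offsets. */
enum toy_field_type { TOY_TIMER, TOY_WORKQUEUE, TOY_TASK_WORK };

struct toy_record {
        int timer_off;
        int wq_off;
        int task_work_off;
};

/* Mirrors the switch in the helper: pick the recorded offset for the
 * requested field type, then require the pointer to hit it exactly. */
static bool toy_field_pointer_ok(const struct toy_record *rec,
                                 enum toy_field_type type, long reg_off)
{
        int field_off;

        switch (type) {
        case TOY_TIMER:     field_off = rec->timer_off;     break;
        case TOY_WORKQUEUE: field_off = rec->wq_off;        break;
        case TOY_TASK_WORK: field_off = rec->task_work_off; break;
        default:            return false;
        }
        return reg_off == field_off;
}

int main(void)
{
        struct toy_record rec = { .timer_off = 0, .wq_off = 16, .task_work_off = 32 };

        assert(toy_field_pointer_ok(&rec, TOY_WORKQUEUE, 16));
        assert(!toy_field_pointer_ok(&rec, TOY_TASK_WORK, 16));
        return 0;
}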
+ */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) @@ -10400,8 +10604,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (err) return err; } else { - bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", - i, arg->arg_type); + verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } @@ -10464,13 +10667,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "kfunc %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "helper %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -10481,9 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); - if (!async_cb) - return -EFAULT; + is_async_cb_sleepable(env, insn)); + if (IS_ERR(async_cb)) + return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; @@ -10499,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins * proceed with next instruction within current frame. */ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); - if (!callback_state) - return -ENOMEM; + if (IS_ERR(callback_state)) + return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); @@ -10522,10 +10725,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, target_insn = *insn_idx + insn->imm + 1; subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. 
No program starts at insn %d\n", target_insn); + if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", + target_insn)) return -EFAULT; - } caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); @@ -10541,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || env->cur_state->active_irq_id || !in_sleepable(env))) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" @@ -10555,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - verbose(env, "Func#%d ('%s') is global and assumed valid.\n", - subprog, sub_name); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ @@ -10583,6 +10786,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; + bpf_reset_live_stack_callchain(env); + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, state, caller->frameno, true); @@ -10708,7 +10913,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = retval_range(0, 1); + callee->callback_ret_range = retval_range(0, 0); return 0; } @@ -10795,6 +11000,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -10837,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) bool in_callback_fn; int err; + err = bpf_update_live_stack(env); + if (err) + return err; + callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { @@ -10858,8 +11097,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, 
int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; @@ -10869,9 +11107,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { - verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", - *insn_idx, callee->callsite); + if (!bpf_calls_callback(env, callee->callsite)) { + verifier_bug(env, "in callback at %d, callsite %d !calls_callback", + *insn_idx, callee->callsite); return -EFAULT; } } else { @@ -10978,8 +11216,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; if (map == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + verifier_bug(env, "expected map for helper call"); + return -EFAULT; } /* In case of read-only, some additional restrictions @@ -11017,7 +11255,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verbose(env, "expected prog array map for tail call"); return -EINVAL; } @@ -11088,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } - if (check_lock && env->cur_state->active_rcu_lock) { + if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } @@ -11124,7 +11362,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { - verbose(env, "verifier bug\n"); + verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; @@ -11160,7 +11398,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } @@ -11220,7 +11458,16 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; +} + +/* Check if we're in a sleepable context. */ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_locks && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -11270,9 +11517,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* With LD_ABS/IND some JITs save/restore skb from r1. 
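Editorial note: several hunks in this range convert push_stack()/push_async_cb() callers from NULL checks to IS_ERR()/PTR_ERR(), so callers propagate the real errno instead of a hard-coded -ENOMEM or -EFAULT. A condensed userspace sketch of that errno-in-pointer pattern, with simplified stand-ins for the kernel's ERR_PTR helpers (the toy_* names are placeholders):

#include <assert.h>
#include <errno.h>
#include <stdlib.h>

/* Simplified versions of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
#define TOY_MAX_ERRNO 4095

static void *toy_err_ptr(long err)     { return (void *)err; }
static int toy_is_err(const void *p)   { return (unsigned long)p >= (unsigned long)-TOY_MAX_ERRNO; }
static long toy_ptr_err(const void *p) { return (long)p; }

/* A push_stack()-like allocator: returns a valid object or an encoded errno. */
static void *toy_push_state(int simulate_failure)
{
        if (simulate_failure)
                return toy_err_ptr(-ENOMEM);
        return malloc(16);
}

int main(void)
{
        void *st = toy_push_state(1);

        /* Caller pattern after the conversion: no NULL check, just IS_ERR(). */
        if (toy_is_err(st))
                assert(toy_ptr_err(st) == -ENOMEM);

        st = toy_push_state(0);
        assert(!toy_is_err(st));
        free(st);
        return 0;
}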
*/ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { - verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", - func_id_name(func_id), func_id); - return -EINVAL; + verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); + return -EFAULT; } memset(&meta, 0, sizeof(meta)); @@ -11280,20 +11526,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_func_proto(fn, func_id); if (err) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", - func_id_name(func_id), func_id); + verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_locks) { if (fn->might_sleep) { verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_preempt_locks) { @@ -11302,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_irq_id) { @@ -11313,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + /* Track non-sleepable context for helpers. */ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11348,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. 
- */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); - return -EFAULT; - } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; @@ -11471,23 +11703,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT; } if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT; } id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id; } @@ -11509,7 +11741,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -11572,9 +11805,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose(env, - "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + verifier_bug(env, "unexpected null map_ptr"); + return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && @@ -11664,10 +11896,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { - verbose(env, "verifier internal error:"); - verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", - func_id_name(func_id)); - return -EINVAL; + verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type", + func_id_name(func_id)); + return -EFAULT; } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; @@ -11692,8 +11923,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = ++env->id_gen; if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { - verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", - func_id_name(func_id), func_id); + verifier_bug(env, "func %s#%d sets ref_obj_id more than once", + func_id_name(func_id), func_id); return -EFAULT; } @@ -11751,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->prog->call_get_func_ip = true; } + if (func_id == BPF_FUNC_tail_call) { + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + clear_all_pkt_pointers(env); + mark_reg_unknown(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } 
else { + changes_data = false; + } + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -11766,17 +12016,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } @@ -11897,6 +12141,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__prog"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -11924,6 +12173,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11934,6 +12184,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11982,11 +12233,26 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); } +static bool is_rbtree_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -12059,6 +12325,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12069,6 +12336,8 @@ enum special_kfunc_type { KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -12076,8 +12345,13 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, + KF_bpf_xdp_pull_data, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12101,41 +12375,13 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, + KF___bpf_trap, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; -BTF_SET_START(special_kfunc_set) -BTF_ID(func, bpf_obj_new_impl) 
-BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front_impl) -BTF_ID(func, bpf_list_push_back_impl) -BTF_ID(func, bpf_list_pop_front) -BTF_ID(func, bpf_list_pop_back) -BTF_ID(func, bpf_cast_to_kern_ctx) -BTF_ID(func, bpf_rdonly_cast) -BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add_impl) -BTF_ID(func, bpf_rbtree_first) -#ifdef CONFIG_NET -BTF_ID(func, bpf_dynptr_from_skb) -BTF_ID(func, bpf_dynptr_from_xdp) -#endif -BTF_ID(func, bpf_dynptr_slice) -BTF_ID(func, bpf_dynptr_slice_rdwr) -BTF_ID(func, bpf_dynptr_clone) -BTF_ID(func, bpf_percpu_obj_new_impl) -BTF_ID(func, bpf_percpu_obj_drop_impl) -BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) -#ifdef CONFIG_CGROUPS -BTF_ID(func, bpf_iter_css_task_new) -#endif -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_set_dentry_xattr) -BTF_ID(func, bpf_remove_dentry_xattr) -#endif -BTF_SET_END(special_kfunc_set) - BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) @@ -12144,6 +12390,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -12151,12 +12399,19 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) +BTF_ID(func, bpf_xdp_pull_data) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -12194,6 +12449,17 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) +BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12225,6 +12491,11 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; } +static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -12284,6 +12555,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12411,7 +12685,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) kfunc_class = IRQ_LOCK_KFUNC; } else { - verbose(env, 
"verifier internal error: unknown irq flags kfunc\n"); + verifier_bug(env, "unknown irq flags kfunc"); return -EFAULT; } @@ -12452,12 +12726,12 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state struct btf_record *rec = reg_btf_record(reg); if (!env->cur_state->active_locks) { - verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); + verifier_bug(env, "%s w/o active lock", __func__); return -EFAULT; } if (type_flag(reg->type) & NON_OWN_REF) { - verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + verifier_bug(env, "NON_OWN_REF already set"); return -EFAULT; } @@ -12476,8 +12750,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o int i; if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for " - "owning -> non-owning conversion\n"); + verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT; } @@ -12497,7 +12770,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o return 0; } - verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT; } @@ -12559,7 +12832,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->btf; break; default: - verbose(env, "verifier internal error: unknown reg type for lock check\n"); + verifier_bug(env, "unknown reg type for lock check"); return -EFAULT; } id = reg->id; @@ -12579,14 +12852,19 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12623,7 +12901,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -12686,7 +12965,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -12713,7 +12994,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, u32 head_off; if (meta->btf != btf_vmlinux) { - verbose(env, 
"verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12744,7 +13025,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } if (*head_field) { - verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); + verifier_bug(env, "repeating %s arg", head_type_name); return -EFAULT; } *head_field = field; @@ -12781,7 +13062,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, u32 node_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12906,6 +13187,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; + if (is_kfunc_arg_prog(btf, &args[i])) { + /* Used to reject repeated use of __prog. */ + if (meta->arg_prog) { + verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); + return -EFAULT; + } + meta->arg_prog = true; + cur_aux(env)->arg_prog = regno; + continue; + } + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -12914,7 +13206,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(reg->var_off)) { @@ -12966,9 +13258,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); + verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; @@ -12991,7 +13283,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13006,6 +13299,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13044,11 +13343,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: - WARN_ON_ONCE(1); + verifier_bug(env, "unknown kfunc arg type %d", 
kf_arg_type); return -EFAULT; } @@ -13112,19 +13412,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { - verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT; } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } @@ -13137,7 +13444,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } meta->initialized_dynptr.id = id; @@ -13200,22 +13507,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } @@ -13273,7 +13580,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { @@ -13305,7 +13612,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ rec = reg_btf_record(reg); if (!rec) { - 
verbose(env, "verifier internal error: Couldn't find btf_record\n"); + verifier_bug(env, "Couldn't find btf_record"); return -EFAULT; } @@ -13335,6 +13642,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13420,6 +13736,186 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +/* check special kfuncs and return: + * 1 - not fall-through to 'else' branch, continue verification + * 0 - fall-through to 'else' branch + * < 0 - not fall-through to 'else' branch, return error + */ +static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, + const struct btf_type *ptr_type, struct btf *desc_btf) +{ + const struct btf_type *ret_t; + int err = 0; + + if (meta->btf != btf_vmlinux) + return 0; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; + struct btf *ret_btf; + u32 ret_btf_id; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta->arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. 
+ */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; + + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = struct_meta; + } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta->arg_btf; + regs[BPF_REG_0].btf_id = meta->arg_btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta->arg_btf, + meta->arg_btf_id); + } else if (is_list_node_type(ptr_type)) { + struct btf_field *field = meta->arg_list_head.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (is_rbtree_node_type(ptr_type)) { + struct btf_field *field = meta->arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->ret_btf_id; + } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); + if (!ret_t) { + verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n", + meta->arg_constant.value); + return -EINVAL; + } else if (btf_type_is_struct(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (btf_type_is_void(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + regs[BPF_REG_0].mem_size = 0; + } else { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n"); + return -EINVAL; + } + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta->arg_constant.found) { + verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta->arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta->func_id == 
special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta->initialized_dynptr.id) { + verifier_bug(env, "no dynptr id"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ + } else { + return 0; + } + + return 1; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -13434,7 +13930,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; - const struct btf_type *ret_t; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -13458,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *regs; branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); - if (!branch) { + if (IS_ERR(branch)) { verbose(env, "failed to push state for failed lock acquisition\n"); - return -ENOMEM; + return PTR_ERR(branch); } regs = branch->frame[branch->curframe]->regs; @@ -13476,6 +13971,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { + verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); + return -EFAULT; } if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { @@ -13489,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. 
*/ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) @@ -13519,42 +14021,49 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); preempt_disable = is_kfunc_bpf_preempt_disable(&meta); preempt_enable = is_kfunc_bpf_preempt_enable(&meta); - if (env->cur_state->active_rcu_lock) { + if (rcu_lock) { + env->cur_state->active_rcu_locks++; + } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (rcu_lock) { - verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + if (env->cur_state->active_rcu_locks == 0) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; - } else if (rcu_unlock) { + } + if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); - env->cur_state->active_rcu_lock = false; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; } - } else if (rcu_lock) { - env->cur_state->active_rcu_lock = true; - } else if (rcu_unlock) { - verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); - return -EINVAL; + } else if (sleepable && env->cur_state->active_rcu_locks) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; } if (env->cur_state->active_preempt_locks) { @@ -13578,16 +14087,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) { + verbose(env, "kernel func %s requires RCU critical section protection\n", func_name); + return -EACCES; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 
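For context on the active_rcu_locks change above, turning the flag into a counter reduces the lock/unlock pairing rule to simple arithmetic; a minimal standalone sketch of that bookkeeping (struct and function names here are illustrative, not the verifier's):

#include <stdio.h>

struct vstate { int active_rcu_locks; };

/* bpf_rcu_read_lock(): nesting is accepted, just count it */
static void rcu_lock(struct vstate *st)
{
	st->active_rcu_locks++;
}

/* bpf_rcu_read_unlock(): reject unmatched unlocks; only the outermost
 * unlock (counter dropping to 0) downgrades MEM_RCU pointers.
 */
static int rcu_unlock(struct vstate *st)
{
	if (st->active_rcu_locks == 0)
		return -1;		/* "unmatched rcu read unlock" */
	if (--st->active_rcu_locks == 0)
		printf("outermost unlock: downgrade MEM_RCU regs\n");
	return 0;
}

int main(void)
{
	struct vstate st = { 0 };

	rcu_lock(&st);
	rcu_lock(&st);			/* nested lock, counter 1 -> 2 */
	rcu_unlock(&st);		/* inner unlock, regs stay MEM_RCU */
	rcu_unlock(&st);		/* outermost unlock, counter hits 0 */
	if (rcu_unlock(&st))		/* a third unlock is unmatched */
		printf("unmatched unlock rejected\n");
	return 0;
}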
*/ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = ®s[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || @@ -13654,165 +14174,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); - - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *struct_meta; - struct btf *ret_btf; - u32 ret_btf_id; - - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) - return -ENOMEM; - - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { - verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); - return -EINVAL; - } - - ret_btf = env->prog->aux->btf; - ret_btf_id = meta.arg_constant.value; - - /* This may be NULL due to user not supplying a BTF */ - if (!ret_btf) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); - return -EINVAL; - } - - ret_t = btf_type_by_id(ret_btf, ret_btf_id); - if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); - return -EINVAL; - } - - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { - verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", - ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); - return -EINVAL; - } - - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - /* Charge memory allocated with bpf_global_percpu_ma to - * root memcg. The obj_cgroup for root memcg is NULL. 
- */ - err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - mutex_lock(&bpf_percpu_ma_lock); - err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { - verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); - return -EINVAL; - } - - if (struct_meta) { - verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); - return -EINVAL; - } - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = ret_btf; - regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) - regs[BPF_REG_0].type |= MEM_PERCPU; - - insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = struct_meta; - } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_btf; - regs[BPF_REG_0].btf_id = meta.arg_btf_id; - - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_btf, - meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { - struct btf_field *field = meta.arg_list_head.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { - struct btf_field *field = meta.arg_rbtree_root.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.ret_btf_id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { - verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); - return -EINVAL; - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.arg_constant.value; - } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || - meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); - - mark_reg_known_zero(env, regs, BPF_REG_0); - - if (!meta.arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); - return -EFAULT; - } - - regs[BPF_REG_0].mem_size = meta.arg_constant.value; - - /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ - regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; - - if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { - regs[BPF_REG_0].type |= MEM_RDONLY; - } else { - /* this will set env->seen_direct_write to true */ - if (!may_access_direct_pkt_data(env, 
NULL, BPF_WRITE)) { - verbose(env, "the prog does not allow writes to packet data\n"); - return -EINVAL; - } - } - - if (!meta.initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); - return -EFAULT; - } - regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ - } else { - verbose(env, "kernel function %s unhandled dynamic return type\n", - meta.func_name); - return -EFAULT; - } + err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); + if (err) { + if (err < 0) + return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -13846,6 +14211,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Ensures we don't access the memory after a release_reference() */ if (meta.ref_obj_id) regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + + if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; } else { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].btf = desc_btf; @@ -13854,6 +14222,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) regs[BPF_REG_0].type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; @@ -13881,14 +14251,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.btf == btf_vmlinux) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = @@ -13898,6 +14268,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_kfunc_pkt_changing(&meta)) + clear_all_pkt_pointers(env); + nargs = btf_type_vlen(meta.func_proto); args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { @@ -13997,7 +14370,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || + BPF_SRC(insn->code) == BPF_K || + cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -14038,16 +14413,15 @@ struct bpf_sanitize_info { bool mask_to_left; }; -static struct bpf_verifier_state * -sanitize_speculative_path(struct bpf_verifier_env *env, - const struct bpf_insn *insn, - u32 next_idx, u32 curr_idx) +static int sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, 
curr_idx, true); - if (branch && insn) { + if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); @@ -14056,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->src_reg); } } - return branch; + return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -14075,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; - bool ret; int err; if (can_skip_alu_sanitation(env, insn)) @@ -14148,11 +14521,12 @@ do_sim: tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } - ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, - env->insn_idx); - if (!ptr_is_dst_reg && ret) + err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); + if (err < 0) + return REASON_STACK; + if (!ptr_is_dst_reg) *dst_reg = tmp; - return !ret ? REASON_STACK : 0; + return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) @@ -14197,10 +14571,9 @@ static int sanitize_err(struct bpf_verifier_env *env, case REASON_STACK: verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); - break; + return -ENOMEM; default: - verbose(env, "verifier internal error: unknown reason (%d)\n", - reason); + verifier_bug(env, "unknown reason (%d)", reason); break; } @@ -14267,7 +14640,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, } break; default: - break; + return -EOPNOTSUPP; } return 0; @@ -14294,7 +14667,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - int ret; + int ret, bounds_ret; dst_reg = ®s[dst]; @@ -14326,6 +14699,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to track offsets. 
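The push_stack() call sites in this area move from NULL checks to the kernel's ERR_PTR convention; as a rough illustration of how that encoding behaves, here is a simplified userspace re-creation of the IS_ERR()/PTR_ERR_OR_ZERO() idea (not the real macros; fake_push_stack() is invented for the demo):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)	{ return IS_ERR_VALUE((unsigned long)ptr); }
static inline long PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

/* stand-in for push_stack(): errors come back encoded in the pointer */
static void *fake_push_stack(int fail)
{
	static int dummy_state;

	return fail ? ERR_PTR(-ENOMEM) : (void *)&dummy_state;
}

int main(void)
{
	void *branch = fake_push_stack(1);

	if (IS_ERR(branch))		/* caller propagates the encoded errno */
		printf("push failed: %ld\n", PTR_ERR(branch));
	printf("or-zero form: %ld\n", PTR_ERR_OR_ZERO(fake_push_stack(0)));
	return 0;
}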
+ */ + if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) + return 0; + switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: @@ -14494,11 +14874,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); - if (sanitize_check_bounds(env, insn, dst_reg) < 0) - return -EACCES; + bounds_ret = sanitize_check_bounds(env, insn, dst_reg); + if (bounds_ret == -EACCES) + return bounds_ret; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &info, true); + if (verifier_bug_if(!can_skip_alu_sanitation(env, insn) + && !env->cur_state->speculative + && bounds_ret + && !ret, + env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { + return -EFAULT; + } if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -14513,14 +14901,25 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. + */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U32_MAX; } @@ -14533,14 +14932,25 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. 
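To make the unsigned-addition rule in the comment above concrete, here is a small standalone check of the three possible cases for 64-bit bounds, with __builtin_add_overflow standing in for the kernel's check_add_overflow():

#include <stdint.h>
#include <stdio.h>

/* apply the rule to [umin, umax] + [src_umin, src_umax] */
static void add_bounds(uint64_t umin, uint64_t umax, uint64_t src_umin, uint64_t src_umax)
{
	uint64_t new_min, new_max;
	int min_ovf = __builtin_add_overflow(umin, src_umin, &new_min);
	int max_ovf = __builtin_add_overflow(umax, src_umax, &new_max);

	if (!min_ovf && max_ovf) {	/* only some additions overflow: unknown */
		new_min = 0;
		new_max = UINT64_MAX;
	}
	printf("[%llu, %llu]\n", (unsigned long long)new_min, (unsigned long long)new_max);
}

int main(void)
{
	add_bounds(1, 2, 3, 4);				/* no overflow: [4, 6] */
	add_bounds(UINT64_MAX - 1, UINT64_MAX, 2, 3);	/* every addition wraps: still exact, [0, 2] */
	add_bounds(1, UINT64_MAX, 0, 1);		/* only the max wraps: [0, UINT64_MAX] */
	return 0;
}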
+ */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } @@ -14551,8 +14961,11 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { @@ -14560,14 +14973,18 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (dst_reg->u32_min_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->u32_min_value -= umax_val; - dst_reg->u32_max_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. + */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U32_MAX; } } @@ -14576,8 +14993,11 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { @@ -14585,14 +15005,18 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. 
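The analogous subtraction rule can be exercised the same way; a short standalone sketch, again with the GCC/Clang builtin standing in for check_sub_overflow():

#include <stdint.h>
#include <stdio.h>

/* apply the rule to [umin, umax] - [src_umin, src_umax] */
static void sub_bounds(uint64_t umin, uint64_t umax, uint64_t src_umin, uint64_t src_umax)
{
	uint64_t new_min, new_max;
	int min_unf = __builtin_sub_overflow(umin, src_umax, &new_min);
	int max_unf = __builtin_sub_overflow(umax, src_umin, &new_max);

	if (min_unf && !max_unf) {	/* only some subtractions wrap: unknown */
		new_min = 0;
		new_max = UINT64_MAX;
	}
	printf("[%llu, %llu]\n", (unsigned long long)new_min, (unsigned long long)new_max);
}

int main(void)
{
	sub_bounds(10, 20, 1, 3);	/* no wrap: [7, 19] */
	sub_bounds(1, 2, 5, 7);		/* every subtraction wraps: exact wrapped range */
	sub_bounds(1, 20, 2, 5);	/* only the lower bound wraps: [0, UINT64_MAX] */
	return 0;
}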
+ */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U64_MAX; } } @@ -15054,6 +15478,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: + case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: @@ -15122,6 +15547,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; + case BPF_NEG: + env->fake_reg[0] = *dst_reg; + __mark_reg_known(dst_reg, 0); + scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); + scalar_min_max_sub(dst_reg, &env->fake_reg[0]); + dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); + break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); @@ -15261,12 +15693,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (WARN_ON_ONCE(ptr_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); - return -EINVAL; + return -EFAULT; } if (WARN_ON(!src_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); - return -EINVAL; + return -EFAULT; } err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15345,7 +15777,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + if (opcode == BPF_NEG && + regs[insn->dst_reg].type == SCALAR_VALUE) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_scalar_min_max_vals(env, insn, + ®s[insn->dst_reg], + regs[insn->dst_reg]); + } else { + err = check_reg_arg(env, insn->dst_reg, DST_OP); + } if (err) return err; @@ -15410,7 +15850,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15429,7 +15868,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!no_sext) dst_reg->id = 0; coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15455,7 +15893,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ if (!is_src_reg_u32) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15466,7 +15903,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -15500,7 +15936,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off > 1 || + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; @@ -15510,7 +15946,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; @@ -15644,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + if (reg1 == reg2) { + switch (opcode) { + case BPF_JGE: + case BPF_JLE: + case BPF_JSGE: + case BPF_JSLE: + case BPF_JEQ: + return 1; + case BPF_JGT: + case BPF_JLT: + case BPF_JSGT: + case BPF_JSLT: + case BPF_JNE: + return 0; + case BPF_JSET: + if (tnum_is_const(t1)) + return t1.value != 0; + else + return (smin1 <= 0 && smax1 >= 0) ? -1 : 1; + default: + return -1; + } + } + switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be @@ -15651,6 +16111,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15675,6 +16137,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; @@ -16012,6 +16476,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); + /* Forget the ranges before narrowing tnums, to avoid invariant + * violations if we're on a dead branch. + */ + __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); @@ -16082,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env, if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) return 0; + /* We compute branch direction for same SCALAR_VALUE registers in + * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET) + * on the same registers, we don't need to adjust the min/max values. 
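As a plain-C sketch of the same-register fast path added to is_scalar_branch_taken() above (heavily simplified: only a handful of opcodes, a scalar modelled by a signed range, helper names invented for the example):

#include <stdio.h>

enum op { OP_JEQ, OP_JNE, OP_JLE, OP_JLT, OP_JSET };

/* a scalar modelled only by its signed range; -1 means "branch unknown" */
struct scalar { long long smin, smax; };

static int branch_taken_same_reg(enum op op, const struct scalar *r)
{
	switch (op) {
	case OP_JEQ:			/* r == r, r <= r: always taken */
	case OP_JLE:
		return 1;
	case OP_JNE:			/* r != r, r < r: never taken */
	case OP_JLT:
		return 0;
	case OP_JSET:			/* r & r is nonzero iff r is nonzero */
		if (r->smin == r->smax)
			return r->smin != 0;
		return (r->smin <= 0 && r->smax >= 0) ? -1 : 1;
	}
	return -1;
}

int main(void)
{
	struct scalar unknown = { -5, 5 }, nonzero = { 1, 9 };

	printf("r <= r : %d\n", branch_taken_same_reg(OP_JLE, &unknown));		/* 1 */
	printf("r != r : %d\n", branch_taken_same_reg(OP_JNE, &unknown));		/* 0 */
	printf("r & r (may be 0): %d\n", branch_taken_same_reg(OP_JSET, &unknown));	/* -1 */
	printf("r & r (never 0) : %d\n", branch_taken_same_reg(OP_JSET, &nonzero));	/* 1 */
	return 0;
}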
+ */ + if (false_reg1 == false_reg2) + return 0; + /* fallthrough (FALSE) branch */ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); reg_bounds_sync(false_reg1); @@ -16377,6 +16852,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -16401,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* branch out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) @@ -16435,6 +16911,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -16444,6 +16925,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + } + + if (insn_flags) { + err = push_jmp_history(env, this_branch, insn_flags, 0); + if (err) + return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; @@ -16466,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * the fall-through branch for simulation under speculative * execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, *insn_idx + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; @@ -16479,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * program will go. If needed, push the goto branch for * simulation under speculative execution. 
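For background on the speculative-path simulation mentioned above, the classic Spectre v1 shape that makes an architecturally dead branch interesting looks roughly like this (generic C, not taken from the kernel):

#include <stdio.h>
#include <stddef.h>

/* A bounds-checked load: architecturally arr[idx] is only reached when
 * idx < size, but the CPU may execute it transiently with an out-of-range
 * idx before the comparison resolves, which is why the verifier also
 * simulates the branch direction it has proven impossible.
 */
static unsigned long bounded_load(const unsigned long *arr, size_t idx, size_t size)
{
	return idx < size ? arr[idx] : 0;
}

int main(void)
{
	unsigned long arr[4] = { 1, 2, 3, 4 };

	printf("%lu\n", bounded_load(arr, 2, 4));
	return 0;
}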
*/ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, - *insn_idx + insn->off + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, + *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; @@ -16499,15 +16991,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, - false); - if (!other_branch) - return -EFAULT; + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_X) { @@ -16654,7 +17145,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->btf_id = aux->btf_var.btf_id; break; default: - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "pseudo btf id: unexpected dst reg type"); return -EFAULT; } return 0; @@ -16690,14 +17181,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - WARN_ON_ONCE(map->max_entries != 1); + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; } else { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "unexpected src reg value for ldimm64"); + return -EFAULT; } return 0; @@ -16743,8 +17235,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } if (!env->ops->gen_ld_abs) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "gen_ld_abs is null"); + return -EFAULT; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || @@ -16863,9 +17355,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char } if (frame->in_async_callback_fn) { - /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; - range = retval_range(0, 0); + range = frame->callback_ret_range; goto enforce_retval; } @@ -17004,7 +17495,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } @@ -17012,7 +17503,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } @@ -17026,8 +17517,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); + caller = 
bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } @@ -17097,7 +17588,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].calls_callback = true; } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } @@ -17154,7 +17645,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose(env, "insn state internal bug\n"); + verifier_bug(env, "insn state internal bug"); return -EFAULT; } return DONE_EXPLORING; @@ -17443,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } +static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = map->ops->map_lookup_elem(map, &i); + /* + * map_lookup_elem of an array map will never return an error, + * but not checking it makes some static analysers to worry + */ + if (IS_ERR(value)) + return PTR_ERR(value); + else if (!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. 
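The sort-then-compact idiom used by sort_insn_array_uniq() above is easy to check in isolation; a standalone sketch with libc qsort() standing in for the kernel's sort() (the comparator is written to avoid the wrap a plain subtraction could hit):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int cmp_u32(const void *a, const void *b)
{
	uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;

	return (x > y) - (x < y);
}

/* sort items[] and compact it to unique values, returning the new count */
static int sort_uniq(uint32_t *items, int cnt)
{
	int unique = 1, i;

	qsort(items, cnt, sizeof(items[0]), cmp_u32);
	for (i = 1; i < cnt; i++)
		if (items[i] != items[unique - 1])
			items[unique++] = items[i];
	return unique;
}

int main(void)
{
	uint32_t offs[] = { 7, 3, 7, 3, 3, 12 };
	int n = sort_uniq(offs, 6);

	for (int i = 0; i < n; i++)
		printf("%u ", offs[i]);		/* prints: 3 7 12 */
	printf("\n");
	return 0;
}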
Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? jt->cnt : 0; + jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + static struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? 
KEEP_EXPLORING : DONE_EXPLORING; +} + +static int visit_tailcall_insn(struct bpf_verifier_env *env, int t) +{ + static struct bpf_subprog_info *subprog; + struct bpf_iarray *jt; + + if (env->insn_aux_data[t].jt) + return 0; + + jt = iarray_realloc(NULL, 2); + if (!jt) + return -ENOMEM; + + subprog = bpf_find_containing_subprog(env, t); + jt->items[0] = t + 1; + jt->items[1] = subprog->exit_idx; + env->insn_aux_data[t].jt = jt; + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -17503,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (bpf_helper_changes_pkt_data(insn->imm)) mark_subprog_changes_pkt_data(env, t); + if (insn->imm == BPF_FUNC_tail_call) + visit_tailcall_insn(env, t); } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17529,12 +18263,14 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ if (ret == 0 && is_kfunc_sleepable(&meta)) mark_subprog_might_sleep(env, t); + if (ret == 0 && is_kfunc_pkt_changing(&meta)) + mark_subprog_changes_pkt_data(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; @@ -17571,26 +18307,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state, *insn_postorder; + int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; - insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_state) return -ENOMEM; - insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - if (!insn_postorder) { - kvfree(insn_state); - kvfree(insn_stack); - return -ENOMEM; - } - ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; @@ -17608,13 +18337,12 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; - insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; default: if (ret > 0) { - verbose(env, "visit_insn internal bug\n"); + verifier_bug(env, "visit_insn internal bug"); ret = -EFAULT; } goto err_free; @@ -17622,7 +18350,7 @@ walk_cfg: } if (env->cfg.cur_stack < 0) { - verbose(env, "pop stack internal bug\n"); + verifier_bug(env, "pop stack internal bug"); ret = -EFAULT; goto err_free; } @@ -17662,6 +18390,57 @@ err_free: return ret; } +/* + * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. 
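The compute_postorder() routine described above uses an iterative DFS with an explicit stack and DISCOVERED/EXPLORED marks; a compact standalone version over a toy successor list (purely illustrative, not the verifier's data structures):

#include <stdio.h>

#define N 5
#define DISCOVERED 1
#define EXPLORED   2

/* toy CFG: successor lists terminated by -1 */
static const int succ[N][3] = {
	{ 1, 2, -1 },	/* 0 -> 1, 2 */
	{ 3, -1 },	/* 1 -> 3 */
	{ 3, -1 },	/* 2 -> 3 */
	{ 4, -1 },	/* 3 -> 4 */
	{ -1 },		/* 4: exit */
};

int main(void)
{
	int state[N] = { 0 }, stack[N], post[N];
	int sp = 0, np = 0;

	stack[sp++] = 0;
	state[0] = DISCOVERED;
	while (sp) {
		int top = stack[sp - 1];

		if (state[top] & EXPLORED) {	/* all successors finished: emit */
			post[np++] = top;
			sp--;
			continue;
		}
		for (int i = 0; succ[top][i] >= 0; i++) {
			int w = succ[top][i];

			if (!state[w]) {
				stack[sp++] = w;
				state[w] = DISCOVERED;
			}
		}
		state[top] |= EXPLORED;
	}
	for (int i = 0; i < np; i++)
		printf("%d ", post[i]);		/* prints: 4 3 2 1 0 */
	printf("\n");
	return 0;
}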
+ */ +static int compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s; + int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; + + postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + static int check_abnormal_return(struct bpf_verifier_env *env) { int i; @@ -17718,7 +18497,7 @@ static int check_btf_func_early(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!krecord) return -ENOMEM; @@ -17818,7 +18597,7 @@ static int check_btf_func(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); krecord = prog->aux->func_info; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!info_aux) return -ENOMEM; @@ -17904,7 +18683,7 @@ static int check_btf_line(struct bpf_verifier_env *env, * pass in a smaller bpf_line_info object. 
*/ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!linfo) return -ENOMEM; @@ -18194,16 +18973,15 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) } static void clean_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *st) + struct bpf_func_state *st, + u32 ip) { - enum bpf_reg_liveness live; + u16 live_regs = env->insn_aux_data[ip].live_regs_before; int i, j; for (i = 0; i < BPF_REG_FP; i++) { - live = st->regs[i].live; /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) + if (!(live_regs & BIT(i))) /* since the register is unused, clear its state * to make further comparison simpler */ @@ -18211,10 +18989,7 @@ static void clean_func_state(struct bpf_verifier_env *env, } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { - live = st->stack[i].spilled_ptr.live; - /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) { + if (!bpf_stack_slot_alive(env, st->frameno, i)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18225,43 +19000,41 @@ static void clean_func_state(struct bpf_verifier_env *env, static void clean_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - int i; - - if (st->frame[0]->regs[0].live & REG_LIVE_DONE) - /* all regs in this state in all frames were already marked */ - return; + int i, ip; - for (i = 0; i <= st->curframe; i++) - clean_func_state(env, st->frame[i]); + bpf_live_stack_query_init(env, st); + st->cleaned = true; + for (i = 0; i <= st->curframe; i++) { + ip = frame_insn_idx(st, i); + clean_func_state(env, st->frame[i], ip); + } } /* the parentage chains form a tree. * the verifier states are added to state lists at given insn and * pushed into state stack for future exploration. - * when the verifier reaches bpf_exit insn some of the verifer states + * when the verifier reaches bpf_exit insn some of the verifier states * stored in the state lists have their final liveness state already, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. * Example: - * 1: r0 = 1 + * 1: *(u64)(r10 - 8) = 1 * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. At the insn 4 it will walk the - * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * 3: *(u64)(r10 - 8) = 2 + * 4: r0 = *(u64)(r10 - 8) + * 5: exit + * when the verifier reaches exit insn the stack slot -8 in the state list of + * insn 2 is not yet marked alive. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. After the insn 4 read, liveness + * analysis would propagate read mark for -8 at insn 2. * * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and * their final liveness marks are already propagated. 
* Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. - * This function also clears the registers and stack for states that !READ - * to simplify state merging. + * we can call this clean_live_states() function to clear dead the registers and stack + * slots to simplify state merging. * * Important note here that walking the same branch instruction in the callee * doesn't meant that the states are DONE. The verifier has to compare @@ -18270,7 +19043,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { - struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; struct list_head *pos, *head; @@ -18279,12 +19051,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) continue; - loop_entry = get_loop_entry(env, &sl->state); - if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) - continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; + if (sl->state.cleaned) + /* all regs in this state in all frames were already marked */ + continue; + if (incomplete_read_marks(env, &sl->state)) + continue; clean_verifier_state(env, &sl->state); } } @@ -18312,9 +19086,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) - /* explored state didn't use this */ - return true; if (rold->type == NOT_INIT) { if (exact == NOT_EXACT || rcur->type == NOT_INIT) /* explored state can't have used this */ @@ -18428,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + rold->off == rcur->off && range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -18438,7 +19213,6 @@ static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; return 0; } late_initcall(unbound_reg_init); @@ -18491,13 +19265,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) - && exact == NOT_EXACT) { - i += BPF_REG_SIZE - 1; - /* explored state didn't use this */ - continue; - } - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; @@ -18616,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_preempt_locks != cur->active_preempt_locks) return false; - if (old->active_rcu_lock != cur->active_rcu_lock) + if (old->active_rcu_locks != cur->active_rcu_locks) return false; if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) @@ -18731,9 +19498,7 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { - 
insn_idx = i == old->curframe - ? env->insn_idx - : old->frame[i + 1]->callsite; + insn_idx = frame_insn_idx(old, i); if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) @@ -18742,91 +19507,13 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; - - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64. - */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); - if (err) - return err; - - return flag; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent) -{ - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; - - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - /* Propagate read liveness of registers... */ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. 
*/ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - if (err < 0) - return err; - } - } - return 0; -} - /* find precise scalars in the previous equivalent state and * propagate them into the current state */ static int propagate_precision(struct bpf_verifier_env *env, - const struct bpf_verifier_state *old) + const struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + bool *changed) { struct bpf_reg_state *state_reg; struct bpf_func_state *state; @@ -18839,8 +19526,7 @@ static int propagate_precision(struct bpf_verifier_env *env, first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -18857,8 +19543,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -18870,17 +19555,54 @@ static int propagate_precision(struct bpf_verifier_env *env, bt_set_frame_slot(&env->bt, fr, i); first = false; } - if (!first) + if (!first && (env->log.level & BPF_LOG_LEVEL2)) verbose(env, "\n"); } - err = mark_chain_precision_batch(env); + err = __mark_chain_precision(env, cur, -1, changed); if (err < 0) return err; return 0; } +#define MAX_BACKEDGE_ITERS 64 + +/* Propagate read and precision marks from visit->backedges[*].state->equal_state + * to corresponding parent states of visit->backedges[*].state until fixed point is reached, + * then free visit->backedges. + * After execution of this function incomplete_read_marks() will return false + * for all states corresponding to @visit->callchain. + */ +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge; + struct bpf_verifier_state *st; + bool changed; + int i, err; + + i = 0; + do { + if (i++ > MAX_BACKEDGE_ITERS) { + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "%s: too many iterations\n", __func__); + for (backedge = visit->backedges; backedge; backedge = backedge->next) + mark_all_scalars_precise(env, &backedge->state); + break; + } + changed = false; + for (backedge = visit->backedges; backedge; backedge = backedge->next) { + st = &backedge->state; + err = propagate_precision(env, st->equal_state, st, &changed); + if (err) + return err; + } + } while (changed); + + free_backedges(visit); + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -18894,7 +19616,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) + offsetof(struct bpf_reg_state, frameno))) return false; return true; } @@ -18908,7 +19630,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * terminology) calls specially: as opposed to bounded BPF loops, it *expects* * states to match, which otherwise would look like an infinite loop. 
So while * iter_next() calls are taken care of, we still need to be careful and - * prevent erroneous and too eager declaration of "ininite loop", when + * prevent erroneous and too eager declaration of "infinite loop", when * iterators are involved. * * Here's a situation in pseudo-BPF assembly form: @@ -18950,7 +19672,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * * This approach allows to keep infinite loop heuristic even in the face of * active iterator. E.g., C snippet below is and will be detected as - * inifintely looping: + * infinitely looping: * * struct bpf_iter_num it; * int *p, x; @@ -18990,14 +19712,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; - int i, j, n, err, states_cnt = 0; - bool force_new_state, add_new_state, force_exact; + struct bpf_verifier_state *cur = env->cur_state, *new; + bool force_new_state, add_new_state, loop; + int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->insn_hist_end - cur->insn_hist_start > 40; + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -19014,6 +19736,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); + loop = false; head = explored_state(env, insn_idx); list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -19093,7 +19816,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19102,11 +19825,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } - if (calls_callback(env, insn_idx)) { + if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; @@ -19144,62 +19867,111 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - /* If sl->state is a part of a loop and this loop's entry is a part of - * current verification path then states have to be compared exactly. - * 'force_exact' is needed to catch the following case: - * - * initial Here state 'succ' was processed first, - * | it was eventually tracked to produce a - * V state identical to 'hdr'. - * .---------> hdr All branches from 'succ' had been explored - * | | and thus 'succ' has its .branches == 0. - * | V - * | .------... Suppose states 'cur' and 'succ' correspond - * | | | to the same instruction + callsites. - * | V V In such case it is necessary to check - * | ... ... if 'succ' and 'cur' are states_equal(). - * | | | If 'succ' and 'cur' are a part of the - * | V V same loop exact flag has to be set. - * | succ <- cur To check if that is the case, verify - * | | if loop entry of 'succ' is in current - * | V DFS path. - * | ... 
- * | | - * '----' - * - * Additional details are in the comment before get_loop_entry(). - */ - loop_entry = get_loop_entry(env, &sl->state); - if (IS_ERR(loop_entry)) - return PTR_ERR(loop_entry); - force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { - if (force_exact) - update_loop_entry(env, cur, loop_entry); + /* See comments for mark_all_regs_read_and_precise() */ + loop = incomplete_read_marks(env, &sl->state); + if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. - * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ + err = 0; if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_insn_history(env, cur, 0, 0); - err = err ? : propagate_precision(env, &sl->state); + err = push_jmp_history(env, cur, 0, 0); + err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; + /* When processing iterator based loops above propagate_liveness and + * propagate_precision calls are not sufficient to transfer all relevant + * read and precision marks. E.g. consider the following case: + * + * .-> A --. Assume the states are visited in the order A, B, C. + * | | | Assume that state B reaches a state equivalent to state A. + * | v v At this point, state C is not processed yet, so state A + * '-- B C has not received any read or precision marks from C. + * Thus, marks propagated from A to B are incomplete. + * + * The verifier mitigates this by performing the following steps: + * + * - Prior to the main verification pass, strongly connected components + * (SCCs) are computed over the program's control flow graph, + * intraprocedurally. + * + * - During the main verification pass, `maybe_enter_scc()` checks + * whether the current verifier state is entering an SCC. If so, an + * instance of a `bpf_scc_visit` object is created, and the state + * entering the SCC is recorded as the entry state. + * + * - This instance is associated not with the SCC itself, but with a + * `bpf_scc_callchain`: a tuple consisting of the call sites leading to + * the SCC and the SCC id. See `compute_scc_callchain()`. + * + * - When a verification path encounters a `states_equal(..., + * RANGE_WITHIN)` condition, there exists a call chain describing the + * current state and a corresponding `bpf_scc_visit` instance. A copy + * of the current state is created and added to + * `bpf_scc_visit->backedges`. + * + * - When a verification path terminates, `maybe_exit_scc()` is called + * from `update_branch_counts()`. For states with `branches == 0`, it + * checks whether the state is the entry state of any `bpf_scc_visit` + * instance. If it is, this indicates that all paths originating from + * this SCC visit have been explored. 
`propagate_backedges()` is then + * called, which propagates read and precision marks through the + * backedges until a fixed point is reached. + * (In the earlier example, this would propagate marks from A to B, + * from C to A, and then again from A to B.) + * + * A note on callchains + * -------------------- + * + * Consider the following example: + * + * void foo() { loop { ... SCC#1 ... } } + * void main() { + * A: foo(); + * B: ... + * C: foo(); + * } + * + * Here, there are two distinct callchains leading to SCC#1: + * - (A, SCC#1) + * - (C, SCC#1) + * + * Each callchain identifies a separate `bpf_scc_visit` instance that + * accumulates backedge states. The `propagate_{liveness,precision}()` + * functions traverse the parent state of each backedge state, which + * means these parent states must remain valid (i.e., not freed) while + * the corresponding `bpf_scc_visit` instance exists. + * + * Associating `bpf_scc_visit` instances directly with SCCs instead of + * callchains would break this invariant: + * - States explored during `C: foo()` would contribute backedges to + * SCC#1, but SCC#1 would only be exited once the exploration of + * `A: foo()` completes. + * - By that time, the states explored between `A: foo()` and `C: foo()` + * (i.e., `B: ...`) may have already been freed, causing the parent + * links for states from `C: foo()` to become invalid. + */ + if (loop) { + struct bpf_scc_backedge *backedge; + + backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT); + if (!backedge) + return -ENOMEM; + err = copy_verifier_state(&backedge->state, cur); + backedge->state.equal_state = &sl->state; + backedge->state.insn_idx = insn_idx; + err = err ?: add_scc_backedge(env, &sl->state, backedge); + if (err) { + free_verifier_state(&backedge->state, false); + kfree(backedge); + return err; + } + } return 1; } miss: @@ -19251,7 +20023,7 @@ miss: * When looping the sl->state.branches will be > 0 and this state * will not be considered for equivalence until branches == 0. */ - new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT); if (!new_sl) return -ENOMEM; env->total_states++; @@ -19273,46 +20045,21 @@ miss: return err; } new->insn_idx = insn_idx; - WARN_ONCE(new->branches != 1, - "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + verifier_bug_if(new->branches != 1, env, + "%s:branches_to_explore=%d insn %d", + __func__, new->branches, insn_idx); + err = maybe_enter_scc(env, new); + if (err) { + free_verifier_state(new, false); + kfree(new_sl); + return err; + } cur->parent = new; cur->first_insn_idx = insn_idx; - cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; + clear_jmp_history(cur); list_add(&new_sl->node, head); - - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. 
Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; - } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; - } - } return 0; } @@ -19351,10 +20098,27 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type) +{ + switch (base_type(type)) { + case PTR_TO_MEM: + case PTR_TO_BTF_ID: + return true; + default: + return false; + } +} + +static bool is_ptr_to_mem(enum bpf_reg_type type) +{ + return base_type(type) == PTR_TO_MEM; +} + static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + enum bpf_reg_type merged_type; if (*prev_type == NOT_INIT) { /* Saw a valid insn @@ -19371,15 +20135,24 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * Reject it. */ if (allow_trust_mismatch && - base_type(type) == PTR_TO_BTF_ID && - base_type(*prev_type) == PTR_TO_BTF_ID) { + is_ptr_to_mem_or_btf_id(type) && + is_ptr_to_mem_or_btf_id(*prev_type)) { /* * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate * BPF_PROBE_MEM/BPF_PROBE_MEMSX. + * Same behavior of MEM_RDONLY flag. */ - *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type)) + merged_type = PTR_TO_MEM; + else + merged_type = PTR_TO_BTF_ID; + if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED)) + merged_type |= PTR_UNTRUSTED; + if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY)) + merged_type |= MEM_RDONLY; + *prev_type = merged_type; } else { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; @@ -19389,21 +20162,327 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ return 0; } +enum { + PROCESS_BPF_EXIT = 1 +}; + +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, + bool exception_exit) +{ + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback function, + * for which reference_state must match caller reference + * state when it exits. + */ + int err = check_resource_leak(env, exception_exit, + !env->cur_state->curframe, + "BPF_EXIT instruction in main prog"); + if (err) + return err; + + /* The side effect of the prepare_func_exit which is + * being skipped is that it frees bpf_func_state. + * Typically, process_bpf_exit will only be hit with + * outermost exit. copy_verifier_state in pop_stack will + * handle freeing of any extra bpf_func_state left over + * from not processing all nested function exits. We + * also skip return code checks as they are not needed + * for exceptional exits. 
+ */ + if (exception_exit) + return PROCESS_BPF_EXIT; + + if (env->cur_state->curframe) { + /* exit from nested function */ + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + *do_print_state = true; + return 0; + } + + err = check_return_code(env, BPF_REG_0, "R0"); + if (err) + return err; + return PROCESS_BPF_EXIT; +} + +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index, max_index; + const u32 size = 8; + + if (check_add_overflow(reg->umin_value, reg->off, &min_index) || + (min_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", + regno, reg->umin_value, reg->off); + return -ERANGE; + } + if (check_add_overflow(reg->umax_value, reg->off, &max_index) || + (max_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", + regno, reg->umax_value, reg->off); + return -ERANGE; + } + + min_index /= size; + max_index /= size; + + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; + } + + *pmin_index = min_index; + *pmax_index = max_index; + return 0; +} + +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; + + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } + + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; + + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; + + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; + + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } + + n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } + + for (i = 0; i < n - 1; i++) { + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); + } + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + return 0; +} + +static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) +{ + int err; + struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). 
+ */ + err = check_load_mem(env, insn, false, is_ldsx, true, "ldx"); + if (err) + return err; + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, insn); + if (err) + return err; + env->insn_idx++; + return 0; + } + + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + + err = check_store_reg(env, insn, false); + if (err) + return err; + } else if (class == BPF_ST) { + enum bpf_reg_type dst_reg_type; + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = cur_regs(env)[insn->dst_reg].type; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false, false); + if (err) + return err; + + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; + } else if (class == BPF_JMP || class == BPF_JMP32) { + u8 opcode = BPF_OP(insn->code); + + env->jmps_processed++; + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && + insn->off != 0) || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || + insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { + verbose(env, "BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + if (env->cur_state->active_locks) { + if ((insn->src_reg == BPF_REG_0 && + insn->imm != BPF_FUNC_spin_unlock) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { + verbose(env, + "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } + } + if (insn->src_reg == BPF_PSEUDO_CALL) { + err = check_func_call(env, insn, &env->insn_idx); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + err = check_kfunc_call(env, insn, &env->insn_idx); + if (!err && is_bpf_throw_kfunc(insn)) + return process_bpf_exit_full(env, do_print_state, true); + } else { + err = check_helper_call(env, insn, &env->insn_idx); + } + if (err) + return err; + + mark_reg_scratched(env, BPF_REG_0); + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || + insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + return check_indirect_jump(env, insn); + } + + if (BPF_SRC(insn->code) != BPF_K || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { + verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; + return 0; + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { + verbose(env, "BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + return process_bpf_exit_full(env, do_print_state, false); + } else { + err = check_cond_jmp_op(env, insn, &env->insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = 
check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + env->insn_idx++; + sanitize_mark_insn_seen(env); + } else { + verbose(env, "invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose(env, "unknown insn class %d\n", class); + return -EINVAL; + } + + env->insn_idx++; + return 0; +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { - bool exception_exit = false; struct bpf_insn *insn; - u8 class; - int err; + struct bpf_insn_aux_data *insn_aux; + int err, marks_err; /* reset current history entry on each new instruction */ env->cur_hist_ent = NULL; @@ -19416,7 +20495,7 @@ static int do_check(struct bpf_verifier_env *env) } insn = &insns[env->insn_idx]; - class = BPF_CLASS(insn->code); + insn_aux = &env->insn_aux_data[env->insn_idx]; if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, @@ -19426,6 +20505,7 @@ static int do_check(struct bpf_verifier_env *env) } state->last_insn_idx = env->prev_insn_idx; + state->insn_idx = env->insn_idx; if (is_prune_point(env, env->insn_idx)) { err = is_state_visited(env, env->insn_idx); @@ -19447,7 +20527,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_insn_history(env, state, 0, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -19486,216 +20566,78 @@ static int do_check(struct bpf_verifier_env *env) return err; } - regs = cur_regs(env); sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) - return err; - - } else if (class == BPF_LDX) { - bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + /* Reduce verification complexity by stopping speculative path + * verification when a nospec is encountered. + */ + if (state->speculative && insn_aux->nospec) + goto process_bpf_exit; - /* Check for reserved fields is already done in - * resolve_pseudo_ldimm64(). + err = bpf_reset_stack_write_marks(env, env->insn_idx); + if (err) + return err; + err = do_check_insn(env, &do_print_state); + if (err >= 0 || error_recoverable_with_nospec(err)) { + marks_err = bpf_commit_stack_write_marks(env); + if (marks_err) + return marks_err; + } + if (error_recoverable_with_nospec(err) && state->speculative) { + /* Prevent this speculative path from ever reaching the + * insn that would have been unsafe to execute. 
*/ - err = check_load_mem(env, insn, false, is_ldsx, true, - "ldx"); - if (err) - return err; - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, insn); - if (err) - return err; - env->insn_idx++; - continue; - } - - if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - - err = check_store_reg(env, insn, false); - if (err) - return err; - } else if (class == BPF_ST) { - enum bpf_reg_type dst_reg_type; - - if (BPF_MODE(insn->code) != BPF_MEM || - insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); - return -EINVAL; - } - /* check src operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false, false); + insn_aux->nospec = true; + /* If it was an ADD/SUB insn, potentially remove any + * markings for alu sanitization. + */ + insn_aux->alu_state = 0; + goto process_bpf_exit; + } else if (err < 0) { + return err; + } else if (err == PROCESS_BPF_EXIT) { + goto process_bpf_exit; + } + WARN_ON_ONCE(err); + + if (state->speculative && insn_aux->nospec_result) { + /* If we are on a path that performed a jump-op, this + * may skip a nospec patched-in after the jump. This can + * currently never happen because nospec_result is only + * used for the write-ops + * `*(size*)(dst_reg+off)=src_reg|imm32` which must + * never skip the following insn. Still, add a warning + * to document this in case nospec_result is used + * elsewhere in the future. + * + * All non-branch instructions have a single + * fall-through edge. For these, nospec_result should + * already work. 
+ */ + if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32, env, + "speculation barrier after jump instruction may not have the desired effect")) + return -EFAULT; +process_bpf_exit: + mark_verifier_state_scratched(env); + err = update_branch_counts(env, env->cur_state); if (err) return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = bpf_update_live_stack(env); if (err) return err; - } else if (class == BPF_JMP || class == BPF_JMP32) { - u8 opcode = BPF_OP(insn->code); - - env->jmps_processed++; - if (opcode == BPF_CALL) { - if (BPF_SRC(insn->code) != BPF_K || - (insn->src_reg != BPF_PSEUDO_KFUNC_CALL - && insn->off != 0) || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL && - insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_CALL uses reserved fields\n"); - return -EINVAL; - } - - if (env->cur_state->active_locks) { - if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || - (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; - } - } - if (insn->src_reg == BPF_PSEUDO_CALL) { - err = check_func_call(env, insn, &env->insn_idx); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - err = check_kfunc_call(env, insn, &env->insn_idx); - if (!err && is_bpf_throw_kfunc(insn)) { - exception_exit = true; - goto process_bpf_exit_full; - } - } else { - err = check_helper_call(env, insn, &env->insn_idx); - } - if (err) - return err; - - mark_reg_scratched(env, BPF_REG_0); - } else if (opcode == BPF_JA) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - (class == BPF_JMP && insn->imm != 0) || - (class == BPF_JMP32 && insn->off != 0)) { - verbose(env, "BPF_JA uses reserved fields\n"); - return -EINVAL; - } - - if (class == BPF_JMP) - env->insn_idx += insn->off + 1; - else - env->insn_idx += insn->imm + 1; - continue; - - } else if (opcode == BPF_EXIT) { - if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_EXIT uses reserved fields\n"); - return -EINVAL; - } -process_bpf_exit_full: - /* We must do check_reference_leak here before - * prepare_func_exit to handle the case when - * state->curframe > 0, it may be a callback - * function, for which reference_state must - * match caller reference state when it exits. - */ - err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, - "BPF_EXIT instruction in main prog"); - if (err) - return err; - - /* The side effect of the prepare_func_exit - * which is being skipped is that it frees - * bpf_func_state. Typically, process_bpf_exit - * will only be hit with outermost exit. - * copy_verifier_state in pop_stack will handle - * freeing of any extra bpf_func_state left over - * from not processing all nested function - * exits. We also skip return code checks as - * they are not needed for exceptional exits. 
- */ - if (exception_exit) - goto process_bpf_exit; - - if (state->curframe) { - /* exit from nested function */ - err = prepare_func_exit(env, &env->insn_idx); - if (err) - return err; - do_print_state = true; - continue; - } - - err = check_return_code(env, BPF_REG_0, "R0"); - if (err) - return err; -process_bpf_exit: - mark_verifier_state_scratched(env); - update_branch_counts(env, env->cur_state); - err = pop_stack(env, &prev_insn_idx, - &env->insn_idx, pop_log); - if (err < 0) { - if (err != -ENOENT) - return err; - break; - } else { - if (WARN_ON_ONCE(env->cur_state->loop_entry)) { - verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n"); - return -EFAULT; - } - do_print_state = true; - continue; - } - } else { - err = check_cond_jmp_op(env, insn, &env->insn_idx); - if (err) - return err; - } - } else if (class == BPF_LD) { - u8 mode = BPF_MODE(insn->code); - - if (mode == BPF_ABS || mode == BPF_IND) { - err = check_ld_abs(env, insn); - if (err) - return err; - - } else if (mode == BPF_IMM) { - err = check_ld_imm(env, insn); - if (err) + err = pop_stack(env, &prev_insn_idx, &env->insn_idx, + pop_log); + if (err < 0) { + if (err != -ENOENT) return err; - - env->insn_idx++; - sanitize_mark_insn_seen(env); + break; } else { - verbose(env, "invalid BPF_LD mode\n"); - return -EINVAL; + do_print_state = true; + continue; } - } else { - verbose(env, "unknown insn class %d\n", class); - return -EINVAL; } - - env->insn_idx++; } return 0; @@ -19747,8 +20689,11 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) if (env->used_btfs[i].btf == btf) return i; - if (env->used_btf_cnt >= MAX_USED_BTFS) + if (env->used_btf_cnt >= MAX_USED_BTFS) { + verbose(env, "The total number of btfs per program has reached the limit of %u\n", + MAX_USED_BTFS); return -E2BIG; + } btf_get(btf); @@ -19914,6 +20859,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { @@ -19978,6 +20929,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: break; default: verbose(env, @@ -20049,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20253,12 +21214,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * [0, off) and [off, end) to new locations, so the patched range stays zero */ static void adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_insn_aux_data *new_data, struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; - u32 old_seen = old_data[off].seen; + u32 old_seen = data[off].seen; u32 prog_len; int i; @@ 
-20266,22 +21226,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the * original insn at old prog. */ - old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); if (cnt == 1) return; prog_len = new_prog->len; - memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); - memcpy(new_data + off + cnt - 1, old_data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memmove(data + off + cnt - 1, data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. */ - new_data[i].seen = old_seen; - new_data[i].zext_dst = insn_has_def32(env, insn + i); + data[i].seen = old_seen; + data[i].zext_dst = insn_has_def32(insn + i); } - env->insn_aux_data = new_data; - vfree(old_data); } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -20298,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20319,10 +21304,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_insn_aux_data *new_data = NULL; if (len > 1) { - new_data = vzalloc(array_size(env->prog->len + len - 1, - sizeof(struct bpf_insn_aux_data))); + new_data = vrealloc(env->insn_aux_data, + array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data)), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_data) return NULL; + + env->insn_aux_data = new_data; } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); @@ -20331,11 +21320,11 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); - vfree(new_data); return NULL; } - adjust_insn_aux_data(env, new_data, new_prog, off, len); + adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -20498,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, return 0; } +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] 
+ */ +static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (aux_data[i].jt) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; @@ -20507,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + clear_insn_aux_data(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -20519,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -20647,7 +21662,10 @@ static int opt_remove_nops(struct bpf_verifier_env *env) static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, const union bpf_attr *attr) { - struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; + struct bpf_insn *patch; + /* use env->insn_buf as two independent buffers */ + struct bpf_insn *zext_patch = env->insn_buf; + struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; @@ -20682,7 +21700,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, * BPF_STX + SRC_OP, so it is safe to pass NULL * here. */ - if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { + if (is_reg64(&insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; @@ -20720,10 +21738,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (bpf_pseudo_kfunc_call(&insn)) continue; - if (WARN_ON(load_reg == -1)) { - verbose(env, "verifier bug. 
zext_dst is set, but no reg is defined\n"); + if (verifier_bug_if(load_reg == -1, env, + "zext_dst is set, but no reg is defined")) return -EFAULT; - } zext_patch[0] = insn; zext_patch[1].dst_reg = load_reg; @@ -20767,8 +21784,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, -(subprogs[0].stack_depth + 8)); if (epilogue_cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "epilogue is too long"); + return -EFAULT; } else if (epilogue_cnt) { /* Save the ARG_PTR_TO_CTX for the epilogue to use */ cnt = 0; @@ -20790,14 +21807,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "gen_prologue is null"); + return -EFAULT; } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "prologue is too long"); + return -EFAULT; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) @@ -20824,6 +21841,28 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bpf_convert_ctx_access_t convert_ctx_access; u8 mode; + if (env->insn_aux_data[i + delta].nospec) { + WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); + struct bpf_insn *patch = insn_buf; + + *patch++ = BPF_ST_NOSPEC(); + *patch++ = *insn; + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + /* This can not be easily merged with the + * nospec_result-case, because an insn may require a + * nospec before and after itself. Therefore also do not + * 'continue' here but potentially apply further + * patching to insn. *insn should equal patch[1] now. + */ + } + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -20873,14 +21912,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_spill) { - struct bpf_insn patch[] = { - *insn, - BPF_ST_NOSPEC(), - }; + env->insn_aux_data[i + delta].nospec_result) { + /* nospec_result is only used to mitigate Spectre v4 and + * to limit verification-time for Spectre v1. + */ + struct bpf_insn *patch = insn_buf; - cnt = ARRAY_SIZE(patch); - new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + *patch++ = *insn; + *patch++ = BPF_ST_NOSPEC(); + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -20915,6 +21956,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. 
*/ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | @@ -20927,10 +21969,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; case PTR_TO_ARENA: if (BPF_MODE(insn->code) == BPF_MEMSX) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; + if (!bpf_jit_supports_insn(insn, true)) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); + } else { + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; default: @@ -20953,8 +21999,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose(env, "bpf verifier narrow ctx access misconfigured\n"); - return -EINVAL; + verifier_bug(env, "narrow ctx access misconfigured"); + return -EFAULT; } size_code = BPF_H; @@ -20972,16 +22018,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "error during ctx access conversion (%d)", cnt); + return -EFAULT; } if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; if (shift && cnt + 1 >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier narrow ctx load misconfigured\n"); - return -EINVAL; + verifier_bug(env, "narrow ctx load misconfigured"); + return -EFAULT; } if (ctx_field_size <= 4) { if (shift) @@ -21027,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; + int old_len, subprog_start_adjustment = 0; if (env->subprog_cnt <= 1) return 0; @@ -21040,11 +22087,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) * propagated in any case. */ subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", - i + insn->imm + 1); + if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", + i + insn->imm + 1)) return -EFAULT; - } /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ @@ -21103,10 +22148,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; + func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->main_prog_aux = prog->aux; for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; @@ -21131,12 +22178,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; if ((BPF_CLASS(insn->code) == BPF_STX || @@ -21154,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; + + /* + * To properly pass the absolute subprog start to jit + * all instruction adjustments should be accumulated + */ + old_len = func[i]->len; func[i] = bpf_int_jit_compile(func[i]); + subprog_start_adjustment += func[i]->len - old_len; + if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -21207,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } + /* + * Cleanup func[i]->aux fields which aren't required + * or can become invalid in future + */ + for (i = 0; i < env->subprog_cnt; i++) { + func[i]->aux->used_maps = NULL; + func[i]->aux->used_map_cnt = 0; + } + /* finally lock prog and jit images for all functions and * populate kallsysm. Begin at the first subprogram, since * bpf_prog_load will add the kallsyms for the main program. 
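[Editor's note] The jit_subprogs() hunk above accumulates, in subprog_start_adjustment, how much each already-compiled subprogram grew (func[i]->len - old_len after bpf_int_jit_compile()), so that func[i]->aux->subprog_start records an absolute start offset that accounts for the growth of all earlier subprograms. A minimal stand-alone sketch of that bookkeeping follows; the struct, the instruction counts and the fake "compile" step are illustrative assumptions, not the kernel's types or JIT API.

/*
 * Illustrative sketch only: mimics the subprog_start_adjustment
 * bookkeeping from the jit_subprogs() hunk above. All values and
 * types here are hypothetical.
 */
#include <stdio.h>

struct fake_subprog {
	unsigned int start;     /* first insn index before any rewriting */
	unsigned int len;       /* insn count before the fake "JIT" */
	unsigned int jited_len; /* insn count after the fake "JIT" */
};

int main(void)
{
	struct fake_subprog sp[3] = {
		{ .start = 0,  .len = 10, .jited_len = 12 },
		{ .start = 10, .len = 5,  .jited_len = 5  },
		{ .start = 15, .len = 8,  .jited_len = 11 },
	};
	int adjustment = 0;
	int i;

	for (i = 0; i < 3; i++) {
		/* absolute start includes the growth of every
		 * subprogram compiled before this one
		 */
		unsigned int abs_start = sp[i].start + adjustment;

		printf("subprog %d: original start %u, adjusted start %u\n",
		       i, sp[i].start, abs_start);

		/* "compile" the subprogram, then accumulate its growth */
		adjustment += (int)sp[i].jited_len - (int)sp[i].len;
	}
	return 0;
}

In this sketch, subprog 0 keeps start 0, subprog 1 moves from 10 to 12 because subprog 0 grew by two instructions, and subprog 2 moves from 15 to 17; the same accumulation order is what the hunk above appears to rely on when handing absolute starts to later consumers.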
@@ -21336,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env) } /* replace a generic kfunc with a specialized version if necessary */ -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr) +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; + + if (offset) /* return if module BTF is used */ + return 0; if (bpf_dev_bound_kfunc_id(func_id)) { xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) { - *addr = (unsigned long)xdp_kfunc; - return; - } + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; /* fallback to default kfunc when not supported by netdev */ - } - - if (offset) - return; - - if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) - *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; /* restore env->seen_direct_write to its original value, since * may_access_direct_pkt_data mutates it */ env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; } - - if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_set_dentry_xattr_locked; - - if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_remove_dentry_xattr_locked; + desc->addr = addr; + return 0; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21398,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { - const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_desc *desc; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -21413,11 +22482,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { - verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", - insn->imm); + verifier_bug(env, "kernel function descriptor not found for func_id %u", + insn->imm); return -EFAULT; } + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) @@ -21429,8 +22502,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; if (desc->func_id == 
special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21446,15 +22519,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && !kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21476,8 +22549,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (!kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21487,13 +22560,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + } - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - *cnt = 3; + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; } return 0; } @@ -21507,7 +22584,7 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat /* We only reserve one slot for hidden subprogs in subprog_info. */ if (env->hidden_subprog_cnt) { - verbose(env, "verifier internal error: only one hidden subprog supported\n"); + verifier_bug(env, "only one hidden subprog supported"); return -EFAULT; } /* We're not patching any existing instruction, just appending the new @@ -21547,13 +22624,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { - struct bpf_insn patch[] = { - env->prog->insnsi[insn_cnt - 1], - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; + struct bpf_insn *patch = insn_buf; - ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + *patch++ = env->prog->insnsi[insn_cnt - 1]; + *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *patch++ = BPF_EXIT_INSN(); + ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); if (ret < 0) return ret; prog = env->prog; @@ -21589,20 +22665,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->off == 1 && insn->imm == -1) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; bool isdiv = BPF_OP(insn->code) == BPF_DIV; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_sdiv[] = { - BPF_RAW_INSN((is64 ? 
BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - }; - struct bpf_insn chk_and_smod[] = { - BPF_MOV32_IMM(insn->dst_reg, 0), - }; + struct bpf_insn *patch = insn_buf; - patchlet = isdiv ? chk_and_sdiv : chk_and_smod; - cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod); + if (isdiv) + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + else + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21621,83 +22695,79 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bool isdiv = BPF_OP(insn->code) == BPF_DIV; bool is_sdiv = isdiv && insn->off == 1; bool is_smod = !isdiv && insn->off == 1; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_div[] = { - /* [R,W]x div 0 -> 0 */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JNE | BPF_K, insn->src_reg, - 0, 2, 0), - BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_mod[] = { - /* [R,W]x mod 0 -> [R,W]x */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, insn->src_reg, - 0, 1 + (is64 ? 0 : 1), 0), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - struct bpf_insn chk_and_sdiv[] = { + struct bpf_insn *patch = insn_buf; + + if (is_sdiv) { /* [R,W]x sdiv 0 -> 0 * LLONG_MIN sdiv -1 -> LLONG_MIN * INT_MIN sdiv -1 -> INT_MIN */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 4, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 1, 0), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_MOV | BPF_K, insn->dst_reg, - 0, 0, 0), + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 4, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 1, 0); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_MOV | BPF_K, insn->dst_reg, + 0, 0, 0); /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_smod[] = { + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; + } else if (is_smod) { /* [R,W]x mod 0 -> [R,W]x */ /* [R,W]x mod -1 -> 0 */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 3, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 3 + (is64 ? 
0 : 1), 1), - BPF_MOV32_IMM(insn->dst_reg, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - - if (is_sdiv) { - patchlet = chk_and_sdiv; - cnt = ARRAY_SIZE(chk_and_sdiv); - } else if (is_smod) { - patchlet = chk_and_smod; - cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0); + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 3, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 3 + (is64 ? 0 : 1), 1); + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; + } else if (isdiv) { + /* [R,W]x div 0 -> 0 */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0); + *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; } else { - patchlet = isdiv ? chk_and_div : chk_and_mod; - cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); + /* [R,W]x mod 0 -> [R,W]x */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; } - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21711,7 +22781,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; u64 uaddress_limit = bpf_arch_uaddress_limit(); if (!uaddress_limit) @@ -21743,8 +22813,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); if (cnt == 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "%d insns generated for ld_abs", cnt); + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); @@ -21762,7 +22832,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; bool issrc, isneg, isimm; u32 off_reg; @@ -22016,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (!in_sleepable(env) || - env->insn_aux_data[i + delta].storage_get_func_atomic) + if (env->insn_aux_data[i + delta].non_sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); @@ -22079,8 +23148,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; if 
(cnt <= 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "%d insns generated for map lookup", cnt); + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, @@ -22367,9 +23436,9 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose(env, - "kernel subsystem misconfigured func %s#%d\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, + "not inlined functions %s#%d is missing func", + func_id_name(insn->imm), insn->imm); return -EFAULT; } insn->imm = fn->func - __bpf_call_base; @@ -22403,7 +23472,7 @@ next_insn: continue; /* We need two slots in case timed may_goto is supported. */ if (stack_slots > slots) { - verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + verifier_bug(env, "stack_slots supports may_goto only"); return -EFAULT; } @@ -22439,8 +23508,8 @@ next_insn: if (!map_ptr->ops->map_poke_track || !map_ptr->ops->map_poke_untrack || !map_ptr->ops->map_poke_run) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "poke tab is misconfigured"); + return -EFAULT; } ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); @@ -22450,7 +23519,9 @@ next_insn: } } - sort_kfunc_descs_by_imm_off(env->prog); + ret = sort_kfunc_descs_by_imm_off(env); + if (ret) + return ret; return 0; } @@ -22630,7 +23701,12 @@ static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl; struct list_head *head, *pos, *tmp; - int i; + struct bpf_scc_info *info; + int i, j; + + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL, false)); list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -22639,6 +23715,16 @@ static void free_states(struct bpf_verifier_env *env) } INIT_LIST_HEAD(&env->free_list); + for (i = 0; i < env->scc_cnt; ++i) { + info = env->scc_info[i]; + if (!info) + continue; + for (j = 0; j < info->num_visits; j++) + free_backedges(&info->visits[j]); + kvfree(info); + env->scc_info[i] = NULL; + } + if (!env->explored_states) return; @@ -22666,13 +23752,14 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) env->prev_linfo = NULL; env->pass_cnt++; - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + state->in_sleepable = env->prog->sleepable; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; @@ -22691,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; - verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; @@ -22723,11 +23811,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; - if (arg->arg_type & PTR_MAYBE_NULL) - reg->type |= PTR_MAYBE_NULL; + reg->type |= arg->arg_type & 
+ (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; - reg->id = ++env->id_gen; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) @@ -22744,8 +23833,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* caller can pass either PTR_TO_ARENA or SCALAR */ mark_reg_unknown(env, regs, i); } else { - WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", - i - BPF_REG_1, arg->arg_type); + verifier_bug(env, "unhandled arg#%d type %d", + i - BPF_REG_1, arg->arg_type); ret = -EFAULT; goto out; } @@ -22775,14 +23864,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) ret = do_check(env); out: - /* check for NULL is necessary, since cur_state can be freed inside - * do_check() under memory pressure. - */ - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - while (!pop_stack(env, NULL, NULL, false)); if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); @@ -22898,7 +23979,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt) { - prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); prog->aux->ctx_arg_info_size = cnt; return prog->aux->ctx_arg_info ? 0 : -ENOMEM; @@ -23379,6 +24460,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif @@ -23507,11 +24589,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { + verbose(env, "Attaching tracing programs to function '%s' is rejected.\n", + tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + tgt_info.tgt_name); return -EINVAL; } @@ -23605,66 +24690,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -static bool can_fallthrough(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return true; - - if (opcode == BPF_EXIT || opcode == BPF_JA) - return false; - - return true; -} - -static bool can_jump(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return false; - - switch (opcode) { - case BPF_JA: - case BPF_JEQ: - case BPF_JNE: - case BPF_JLT: - case BPF_JLE: - case BPF_JGT: - case BPF_JGE: - case BPF_JSGT: - case BPF_JSGE: - case BPF_JSLT: - case BPF_JSLE: - case BPF_JCOND: - return true; - } - - return false; -} - -static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) -{ - struct bpf_insn *insn = &prog->insnsi[idx]; - int i = 0, insn_sz; - u32 dst; - - insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; - if (can_fallthrough(insn) && idx + 1 < prog->len) - succ[i++] = idx + insn_sz; - - if (can_jump(insn)) { - dst = idx + jmp_offset(insn) + 1; - if (i == 0 || succ[0] != dst) - succ[i++] = dst; - } - - return i; -} - /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ @@ -23842,7 +24867,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) * - repeat the computation while {in,out} fields changes for * any instruction. */ - state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) { err = -ENOMEM; goto out; @@ -23857,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env) for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; - int succ_num; - u32 succ[2]; + struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; - succ_num = insn_successors(env->prog, insn_idx, succ); - for (int s = 0; s < succ_num; ++s) - new_out |= state[succ[s]].in; + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; @@ -23880,6 +24904,10 @@ static int compute_live_registers(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "Live regs before insn:\n"); for (i = 0; i < insn_cnt; ++i) { + if (env->insn_aux_data[i].scc) + verbose(env, "%3d ", env->insn_aux_data[i].scc); + else + verbose(env, " "); verbose(env, "%3d: ", i); for (j = BPF_REG_0; j < BPF_REG_10; ++j) if (insn_aux[i].live_regs_before & BIT(j)) @@ -23895,9 +24923,186 @@ static int compute_live_registers(struct bpf_verifier_env *env) out: kvfree(state); - kvfree(env->cfg.insn_postorder); - env->cfg.insn_postorder = NULL; - env->cfg.cur_postorder = 0; + return err; +} + +/* + * Compute strongly connected components (SCCs) on the CFG. + * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. + * If instruction is a sole member of its SCC and there are no self edges, + * assign it SCC number of zero. + * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. + */ +static int compute_scc(struct bpf_verifier_env *env) +{ + const u32 NOT_ON_STACK = U32_MAX; + + struct bpf_insn_aux_data *aux = env->insn_aux_data; + const u32 insn_cnt = env->prog->len; + int stack_sz, dfs_sz, err = 0; + u32 *stack, *pre, *low, *dfs; + u32 i, j, t, w; + u32 next_preorder_num; + u32 next_scc_id; + bool assign_scc; + struct bpf_iarray *succ; + + next_preorder_num = 1; + next_scc_id = 1; + /* + * - 'stack' accumulates vertices in DFS order, see invariant comment below; + * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; + * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; + * - 'dfs' DFS traversal stack, used to emulate explicit recursion. + */ + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); + if (!stack || !pre || !low || !dfs) { + err = -ENOMEM; + goto exit; + } + /* + * References: + * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" + * [2] D. J. 
Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" + * + * The algorithm maintains the following invariant: + * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; + * - then, vertex 'u' remains on stack while vertex 'v' is on stack. + * + * Consequently: + * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', + * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, + * and thus there is an SCC (loop) containing both 'u' and 'v'. + * - If 'low[v] == pre[v]', loops containing 'v' have been explored, + * and 'v' can be considered the root of some SCC. + * + * Here is a pseudo-code for an explicitly recursive version of the algorithm: + * + * NOT_ON_STACK = insn_cnt + 1 + * pre = [0] * insn_cnt + * low = [0] * insn_cnt + * scc = [0] * insn_cnt + * stack = [] + * + * next_preorder_num = 1 + * next_scc_id = 1 + * + * def recur(w): + * nonlocal next_preorder_num + * nonlocal next_scc_id + * + * pre[w] = next_preorder_num + * low[w] = next_preorder_num + * next_preorder_num += 1 + * stack.append(w) + * for s in successors(w): + * # Note: for classic algorithm the block below should look as: + * # + * # if pre[s] == 0: + * # recur(s) + * # low[w] = min(low[w], low[s]) + * # elif low[s] != NOT_ON_STACK: + * # low[w] = min(low[w], pre[s]) + * # + * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' + * # does not break the invariant and makes itartive version of the algorithm + * # simpler. See 'Algorithm #3' from [2]. + * + * # 's' not yet visited + * if pre[s] == 0: + * recur(s) + * # if 's' is on stack, pick lowest reachable preorder number from it; + * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', + * # so 'min' would be a noop. + * low[w] = min(low[w], low[s]) + * + * if low[w] == pre[w]: + * # 'w' is the root of an SCC, pop all vertices + * # below 'w' on stack and assign same SCC to them. + * while True: + * t = stack.pop() + * low[t] = NOT_ON_STACK + * scc[t] = next_scc_id + * if t == w: + * break + * next_scc_id += 1 + * + * for i in range(0, insn_cnt): + * if pre[i] == 0: + * recur(i) + * + * Below implementation replaces explicit recursion with array 'dfs'. + */ + for (i = 0; i < insn_cnt; i++) { + if (pre[i]) + continue; + stack_sz = 0; + dfs_sz = 1; + dfs[0] = i; +dfs_continue: + while (dfs_sz) { + w = dfs[dfs_sz - 1]; + if (pre[w] == 0) { + low[w] = next_preorder_num; + pre[w] = next_preorder_num; + next_preorder_num++; + stack[stack_sz++] = w; + } + /* Visit 'w' successors */ + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); + } else { + dfs[dfs_sz++] = succ->items[j]; + goto dfs_continue; + } + } + /* + * Preserve the invariant: if some vertex above in the stack + * is reachable from 'w', keep 'w' on the stack. + */ + if (low[w] < pre[w]) { + dfs_sz--; + goto dfs_continue; + } + /* + * Assign SCC number only if component has two or more elements, + * or if component has a self reference. 
+ */ + assign_scc = stack[stack_sz - 1] != w; + for (j = 0; j < succ->cnt; ++j) { + if (succ->items[j] == w) { + assign_scc = true; + break; + } + } + /* Pop component elements from stack */ + do { + t = stack[--stack_sz]; + low[t] = NOT_ON_STACK; + if (assign_scc) + aux[t].scc = next_scc_id; + } while (t != w); + if (assign_scc) + next_scc_id++; + dfs_sz--; + } + } + env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT); + if (!env->scc_info) { + err = -ENOMEM; + goto exit; + } + env->scc_cnt = next_scc_id; +exit: + kvfree(stack); + kvfree(pre); + kvfree(low); + kvfree(dfs); return err; } @@ -23909,6 +25114,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 u32 log_true_size; bool is_priv; + BTF_TYPE_EMIT(enum bpf_features); + /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) return -EINVAL; @@ -23916,7 +25123,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT); if (!env) return -ENOMEM; @@ -23930,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -23979,7 +25189,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct list_head), - GFP_USER); + GFP_KERNEL_ACCOUNT); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; @@ -24018,10 +25228,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = compute_postorder(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_stack_liveness_init(env); + if (ret) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; + ret = compute_scc(env); + if (ret < 0) + goto skip_full_check; + ret = compute_live_registers(env); if (ret < 0) goto skip_full_check; @@ -24106,7 +25328,7 @@ skip_full_check: /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_maps) { ret = -ENOMEM; @@ -24121,7 +25343,7 @@ skip_full_check: /* if program passed verifier, update used_btfs in bpf_prog_aux */ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, sizeof(env->used_btfs[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_btfs) { ret = -ENOMEM; goto err_release_maps; @@ -24141,6 +25363,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. 
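/*
 * Illustrative only: a self-contained user-space rendering of the iterative
 * Tarjan/Pearce SCC assignment that compute_scc() above applies to the BPF
 * CFG, run here on a tiny hard-coded graph. The graph[] table and the node
 * count N are invented for this sketch; the kernel walks bpf_insn_successors()
 * and records the result in insn_aux_data[*].scc instead.
 */
#include <stdio.h>

#define N 5
#define NOT_ON_STACK 0xffffffffu

/* adjacency list: up to two successors per node, -1 terminated */
static const int graph[N][2] = {
	{ 1, -1 },	/* 0 -> 1                */
	{ 2, -1 },	/* 1 -> 2                */
	{ 0, -1 },	/* 2 -> 0, closes a loop */
	{ 4, -1 },	/* 3 -> 4                */
	{ 4, -1 },	/* 4 -> 4, self edge     */
};

int main(void)
{
	unsigned int pre[N] = { 0 }, low[N] = { 0 }, scc[N] = { 0 };
	unsigned int stack[N], dfs[N];
	unsigned int next_pre = 1, next_scc = 1;
	unsigned int i, j, w, t;
	int stack_sz, dfs_sz, assign_scc;

	for (i = 0; i < N; i++) {
		if (pre[i])
			continue;
		stack_sz = 0;
		dfs_sz = 1;
		dfs[0] = i;
dfs_continue:
		while (dfs_sz) {
			w = dfs[dfs_sz - 1];
			if (!pre[w]) {
				pre[w] = low[w] = next_pre++;
				stack[stack_sz++] = w;
			}
			/* visit successors, recursing via the explicit dfs[] stack */
			for (j = 0; j < 2 && graph[w][j] >= 0; j++) {
				unsigned int s = graph[w][j];

				if (!pre[s]) {
					dfs[dfs_sz++] = s;
					goto dfs_continue;
				}
				if (low[s] < low[w])
					low[w] = low[s];
			}
			if (low[w] < pre[w]) {		/* not an SCC root yet */
				dfs_sz--;
				continue;
			}
			/* only multi-node SCCs and self loops get a non-zero id */
			assign_scc = stack[stack_sz - 1] != w;
			for (j = 0; j < 2 && graph[w][j] >= 0; j++)
				if ((unsigned int)graph[w][j] == w)
					assign_scc = 1;
			do {
				t = stack[--stack_sz];
				low[t] = NOT_ON_STACK;
				if (assign_scc)
					scc[t] = next_scc;
			} while (t != w);
			if (assign_scc)
				next_scc++;
			dfs_sz--;
		}
	}
	for (i = 0; i < N; i++)
		printf("node %u: scc %u\n", i, scc[i]);	/* 0,1,2 -> 1; 3 -> 0; 4 -> 2 */
	return 0;
}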
@@ -24161,10 +25385,14 @@ err_release_maps: err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); - kvfree(env->insn_hist); err_free_env: + bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); + kvfree(env->scc_info); + kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } diff --git a/kernel/cfi.c b/kernel/cfi.c index 422fa4f958ae..4dad04ead06c 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -5,6 +5,8 @@ * Copyright (C) 2022 Google LLC */ +#include <linux/bpf.h> +#include <linux/cfi_types.h> #include <linux/cfi.h> bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); @@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, return BUG_TRAP_TYPE_BUG; } +/* + * Declare two non-existent functions with types that match bpf_func_t and + * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash + * variables for each function type. The cfi_bpf_* variables are used by + * arch-specific BPF JIT implementations to ensure indirectly callable JIT + * code has matching CFI type hashes. + */ +extern typeof(*(bpf_func_t)0) __bpf_prog_runX; +DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX); + +extern typeof(*(bpf_callback_t)0) __bpf_callback_fn; +DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn); + #ifdef CONFIG_ARCH_USES_CFI_TRAPS static inline unsigned long trap_address(s32 *p) { diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 95ab39e1ec8f..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); @@ -270,9 +273,9 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -int cgroup_rstat_init(struct cgroup *cgrp); -void cgroup_rstat_exit(struct cgroup *cgrp); -void cgroup_rstat_boot(void); +int css_rstat_init(struct cgroup_subsys_state *css); +void css_rstat_exit(struct cgroup_subsys_state *css); +int ss_rstat_init(struct cgroup_subsys *ss); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> @@ -32,6 +33,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ 
+static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -65,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -77,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -114,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -150,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } @@ -499,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -528,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -683,10 +687,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1129,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } @@ -1321,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. 
*/ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } @@ -1359,3 +1364,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 63e5b90da1f3..e717208cfb18 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,8 @@ #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> +#include <linux/nstree.h> +#include <linux/irq_work.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -95,6 +97,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +struct blocking_notifier_head cgroup_lifetime_notifier = + BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); + DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; @@ -121,10 +126,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. 
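/*
 * Not part of this patch: a minimal sketch of the deadlock shape described
 * above. On a per-CPU workqueue with max_active == 1, a work item that sleeps
 * waiting for a later work item queued on the same CPU can never be woken,
 * because that later item will not start until the first one finishes. The
 * names single_wq, stage_a/stage_b and demo() are invented for the sketch;
 * the change here avoids the problem by giving each destruction stage its own
 * workqueue.
 */
#include <linux/workqueue.h>
#include <linux/completion.h>

static struct workqueue_struct *single_wq;
static DECLARE_COMPLETION(stage_b_done);

static void stage_a_fn(struct work_struct *work)
{
	/* blocks forever: stage_b sits behind us in the only execution slot */
	wait_for_completion(&stage_b_done);
}

static void stage_b_fn(struct work_struct *work)
{
	complete(&stage_b_done);
}

static DECLARE_WORK(stage_a, stage_a_fn);
static DECLARE_WORK(stage_b, stage_b_fn);

void demo(void)
{
	single_wq = alloc_workqueue("demo_wq", WQ_PERCPU, 1);
	queue_work(single_wq, &stage_a);
	queue_work(single_wq, &stage_b);	/* never runs on this CPU */
}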
*/ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -161,10 +189,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); +static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ -struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; +struct cgroup_root cgrp_dfl_root = { + .cgrp.self.rstat_cpu = &root_rstat_cpu, + .cgrp.rstat_base_cpu = &root_rstat_base_cpu, +}; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -209,12 +241,18 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; @@ -250,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline @@ -904,7 +943,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_dead()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1295,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while task is between + * cgroup_threadgroup_change_begin() and end(), so down_write global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exlude tasks between + * cgroup_thread_group_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. 
+ */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -1335,6 +1391,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + int ret; trace_cgroup_destroy_root(root); @@ -1343,6 +1400,10 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); + /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); @@ -1371,7 +1432,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_unlock(); - cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1462,9 +1522,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. 
*/ return &cgrp_dfl_root.cgrp; } @@ -1715,7 +1775,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); @@ -1747,7 +1807,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (css->flags & CSS_VISIBLE) return 0; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); @@ -1876,13 +1936,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) } spin_unlock_irq(&css_set_lock); - if (ss->css_rstat_flush) { - list_del_rcu(&css->rstat_css_node); - synchronize_rcu(); - list_add_rcu(&css->rstat_css_node, - &dcgrp->rstat_css_list); - } - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -2065,12 +2118,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; - INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); +#ifdef CONFIG_CGROUP_BPF + for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) + cgrp->bpf.revisions[i] = 1; +#endif + init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -2146,7 +2203,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = cgroup_rstat_init(root_cgrp); + ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; @@ -2154,10 +2211,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto exit_stats; - if (root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); - } + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, root_cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); @@ -2188,7 +2244,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto out; exit_stats: - cgroup_rstat_exit(root_cgrp); + css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -2452,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether acquire and acquire which rwsem + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2472,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. see the comment above + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. 
*/ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether release and release which rwsem + * @tsk: thread group to lock */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2937,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ @@ -2961,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -2969,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. - */ - lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); - +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -3003,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } - get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers + * by cgroup_mutex. Therefore, we can skip the global lock. 
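/*
 * Sketch of the caller pattern around the new lock modes. It mirrors what
 * cgroup_procs_write_start()/cgroup_procs_write_finish() do in the hunks
 * nearby; the helper name pick_attach_lock_mode() is invented for
 * illustration and is not part of the patch.
 */
static enum cgroup_attach_lock_mode
pick_attach_lock_mode(pid_t pid, bool threadgroup)
{
	/* moving current itself as a single thread needs no threadgroup lock */
	if (!pid && !threadgroup)
		return CGRP_ATTACH_LOCK_NONE;

	return cgroup_enable_per_threadgroup_rwsem ?
		CGRP_ATTACH_LOCK_PER_THREADGROUP : CGRP_ATTACH_LOCK_GLOBAL;
}

/*
 * Usage around a migration:
 *
 *	mode = pick_attach_lock_mode(pid, threadgroup);
 *	cgroup_attach_lock(mode, tsk);
 *	ret = cgroup_attach_task(dst_cgrp, tsk, threadgroup);
 *	cgroup_attach_unlock(mode, tsk);
 */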
+ */ + lockdep_assert_held(&cgroup_mutex); + + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { + *lock_mode = CGRP_ATTACH_LOCK_NONE; + } + + cgroup_attach_lock(*lock_mode, tsk); + + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * A race with de_thread from another thread's exec() + * may strip us of our leadership. If this happens, + * throw this task away and try again. + */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + + return tsk; -out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3081,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3112,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3133,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -3756,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. 
*/ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -4152,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -4573,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file @@ -5232,15 +5364,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5255,18 +5386,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5348,6 +5478,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, + { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, @@ -5444,8 +5579,9 @@ static void css_free_rwork_fn(struct work_struct *work) struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); + css_rstat_exit(css); - if (ss) { + if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; @@ -5474,7 +5610,6 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -5499,14 +5634,10 @@ static void css_release_work_fn(struct work_struct *work) css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); - if (ss) { + if (!css_is_self(css)) { struct cgroup *parent_cgrp; - /* css release path */ - if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); - list_del_rcu(&css->rstat_css_node); - } + css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) @@ -5532,7 +5663,7 @@ static void css_release_work_fn(struct work_struct *work) /* 
cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5555,7 +5686,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5564,7 +5695,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5580,7 +5711,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -5589,9 +5719,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (ss->css_rstat_flush) - list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); - BUG_ON(cgroup_css(cgrp, ss)); } @@ -5684,6 +5811,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, goto err_free_css; css->id = err; + err = css_rstat_init(css); + if (err) + goto err_free_css; + /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -5697,9 +5828,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5713,7 +5843,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; - int level = parent->level + 1; + int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ @@ -5725,17 +5855,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5745,21 +5871,27 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; - ret = psi_cgroup_alloc(cgrp); + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. 
+ */ + ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; - if (cgrp->root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; - } + ret = psi_cgroup_alloc(cgrp); + if (ret) + goto out_stat_exit; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5768,27 +5900,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestors[tcgrp->level] = tcgrp; - - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. - */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5797,7 +5912,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->self.serial_nr = css_serial_nr_next++; + ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, + CGROUP_LIFETIME_OFFLINE, cgrp); + ret = notifier_to_errno(ret); + if (ret) + goto out_psi_free; + /* allocation complete, commit to creation */ + spin_lock_irq(&css_set_lock); + for (i = 0; i < level; i++) { + tcgrp = cgrp->ancestors[i]; + tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups get a new + * frozen descendant, but their state can't change because of + * this. 
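/*
 * The freeze/unfreeze transitions themselves are not in this excerpt; a
 * plausible writer side for the new frozen_nsec/freeze_start_nsec accounting,
 * paired with the seqcount read loop in cgroup_core_local_stat_show() above,
 * would look roughly like this. Sketch only: the function name is invented,
 * and css_set_lock is the lock the freeze_seq seqcount is initialized with.
 */
static void cgroup_freeze_account(struct cgroup *cgrp, bool freeze)
{
	lockdep_assert_held(&css_set_lock);

	write_seqcount_begin(&cgrp->freezer.freeze_seq);
	if (freeze)
		cgrp->freezer.freeze_start_nsec = ktime_get_ns();
	else
		cgrp->freezer.frozen_nsec += ktime_get_ns() -
					     cgrp->freezer.freeze_start_nsec;
	write_seqcount_end(&cgrp->freezer.freeze_seq);
}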
+ */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } + spin_unlock_irq(&css_set_lock); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); @@ -5815,10 +5952,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5932,7 +6069,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6015,7 +6152,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; - int ssid; + int ssid, ret; lockdep_assert_held(&cgroup_mutex); @@ -6073,8 +6210,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - if (cgrp->root == &cgrp_dfl_root) - cgroup_bpf_offline(cgrp); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6136,6 +6274,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); + + BUG_ON(ss_rstat_init(ss)); + BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys @@ -6184,6 +6325,8 @@ int __init cgroup_init_early(void) ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + WARN(ss->early_init && ss->css_rstat_flush, + "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; @@ -6212,9 +6355,10 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_rstat_boot(); + BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); + cgroup_rt_init(); cgroup_lock(); @@ -6225,6 +6369,8 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); + cgroup_bpf_lifetime_notifier_init(); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); @@ -6297,6 +6443,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -6310,8 +6457,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
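/*
 * Sketch of how a consumer attaches to the new cgroup_lifetime_notifier
 * chain; cgroup_bpf presumably does something similar from
 * cgroup_bpf_lifetime_notifier_init(). The callback and notifier_block names
 * are invented. CGROUP_LIFETIME_ONLINE/OFFLINE are the actions raised by
 * cgroup_setup_root()/cgroup_create() and by the destroy paths in the hunks
 * above; an error returned for ONLINE unwinds cgroup creation via the
 * _robust call chain variant.
 */
#include <linux/cgroup.h>
#include <linux/notifier.h>

static int my_cgroup_lifetime_cb(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct cgroup *cgrp = data;
	int ret = 0;

	pr_debug("cgroup lifetime event %lu for cgroup %llu\n",
		 action, cgroup_id(cgrp));

	switch (action) {
	case CGROUP_LIFETIME_ONLINE:
		/* allocate per-cgroup state for cgrp here */
		break;
	case CGROUP_LIFETIME_OFFLINE:
		/* release per-cgroup state for cgrp here */
		break;
	}
	return notifier_from_errno(ret);
}

static struct notifier_block my_cgroup_lifetime_nb = {
	.notifier_call = my_cgroup_lifetime_cb,
};

static int __init my_cgroup_lifetime_init(void)
{
	return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
						&my_cgroup_lifetime_nb);
}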
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); @@ -6328,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6358,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. + */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { @@ -6803,19 +6972,29 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +static void do_cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); @@ -6833,15 +7012,61 @@ void cgroup_exit(struct task_struct *tsk) test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); +} - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); +#ifdef CONFIG_PREEMPT_RT +/* + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow + * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, + * this lead to sleeping in the invalid context warning bug. css_set_lock is too + * big to become a raw_spinlock. The task_dead path doesn't need to run + * synchronously but can't be delayed indefinitely either as the dead task pins + * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy + * irq_work to allow batching while ensuring timely completion. 
+ */ +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); + +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) +{ + struct llist_node *lnode; + struct task_struct *task, *next; + + lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); + llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { + do_cgroup_task_dead(task); + put_task_struct(task); + } +} + +static void __init cgroup_rt_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); + per_cpu(cgrp_dead_tasks_iwork, cpu) = + IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); + } +} + +void cgroup_task_dead(struct task_struct *task) +{ + get_task_struct(task); + llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); + irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); +} +#else /* CONFIG_PREEMPT_RT */ +static void __init cgroup_rt_init(void) {} + +void cgroup_task_dead(struct task_struct *task) +{ + do_cgroup_task_dead(task); } +#endif /* CONFIG_PREEMPT_RT */ -void cgroup_release(struct task_struct *task) +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -6849,6 +7074,11 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); +} + +void cgroup_task_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); @@ -6856,11 +7086,7 @@ void cgroup_release(struct task_struct *task) list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } -} -void cgroup_free(struct task_struct *task) -{ - struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..01976c8e7d49 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -156,13 +155,17 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of valid local child partitions */ - int nr_subparts; - /* partition root state */ int partition_root_state; /* + * Whether cpuset is a remote partition. + * It used to be a list anchoring all remote partitions — we can switch back + * to a list if we need to iterate over the remote partitions. + */ + bool remote_partition; + + /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. 
*/ @@ -176,9 +179,6 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; - /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; @@ -202,7 +202,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -277,6 +277,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090..12e76774c75b 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 306b60430091..3e8cc34d8d50 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -81,14 +82,18 @@ static cpumask_var_t subpartitions_cpus; static cpumask_var_t isolated_cpus; /* + * isolated_cpus updating flag (protected by cpuset_mutex) + * Set if isolated_cpus is going to be updated in the current + * cpuset_mutex critical section. + */ +static bool isolated_cpus_updating; + +/* * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot */ static cpumask_var_t boot_hk_cpus; static bool have_boot_isolcpus; -/* List of remote partition root children */ -static struct list_head remote_children; - /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in @@ -131,11 +136,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions.
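The isolated_cpus_updating flag added in the hunk above implements a deferred-update ("dirty flag") pattern: helpers that modify isolated_cpus inside the cpuset_mutex critical section only mark the mask as changed, and the expensive propagation to external consumers is performed once at the end by update_isolation_cpumasks() (see the later hunk that replaces update_unbound_workqueue_cpumask()). The snippet below is a minimal userspace sketch of that pattern, assuming only POSIX threads; the names cfg_lock, isolated_set and propagate_isolation() are illustrative and are not kernel APIs.

    /* Sketch of the dirty-flag batching pattern used for isolated_cpus. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long isolated_set;          /* stand-in for a cpumask */
    static bool isolated_set_updating;          /* protected by cfg_lock */

    static void isolated_set_update(unsigned long add, unsigned long del)
    {
            isolated_set |= add;
            isolated_set &= ~del;
            isolated_set_updating = true;       /* defer the expensive part */
    }

    static void propagate_isolation(void)
    {
            if (!isolated_set_updating)
                    return;                     /* nothing changed, skip the heavy work */
            printf("pushing new isolation mask 0x%lx to consumers\n", isolated_set);
            isolated_set_updating = false;
    }

    int main(void)
    {
            pthread_mutex_lock(&cfg_lock);
            isolated_set_update(0x0c, 0x00);    /* several changes ...            */
            isolated_set_update(0x30, 0x04);    /* ... in one critical section    */
            propagate_isolation();              /* single propagation at the end  */
            pthread_mutex_unlock(&cfg_lock);
            return 0;
    }

The benefit is that several partition operations within one critical section trigger only a single propagation, and operations that never touch isolation skip the external updates entirely.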
@@ -159,16 +159,21 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -192,12 +197,26 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) WRITE_ONCE(cs->prs_err, PERR_NONE); } +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. + * + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. + */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, - .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), + .remote_partition = false, }; /* @@ -236,6 +255,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -246,6 +271,24 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. 
+ */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -266,7 +309,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); @@ -313,33 +356,55 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * - * It is assumed that @cs is a valid partition root. @excluded_child should - * be non-NULL when this cpuset is going to become a partition itself. + * @cs should be a valid partition root or going to become a partition root. + * @excluded_child should be non-NULL when this cpuset is going to become a + * partition itself. + * + * Note that a remote partition is not allowed underneath a valid local + * or remote partition. So if a non-partition root child is populated, + * the whole partition is considered populated. */ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { - struct cgroup_subsys_state *css; - struct cpuset *child; + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; - if (cs->css.cgroup->nr_populated_csets) + /* + * We cannot call cs_is_populated(cs) directly, as + * nr_populated_domain_children may include populated + * csets from descendants that are partitions. + */ + if (cs->css.cgroup->nr_populated_csets || + cs->attach_in_progress) return true; - if (!excluded_child && !cs->nr_subparts) - return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); - cpuset_for_each_child(child, css, cs) { - if (child == excluded_child) + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + if (cp == cs || cp == excluded_child) continue; - if (is_partition_valid(child)) + + if (is_partition_valid(cp)) { + pos_css = css_rightmost_descendant(pos_css); continue; - if (cgroup_is_populated(child->css.cgroup)) { + } + + if (cpuset_is_populated(cp)) { rcu_read_unlock(); return true; } @@ -355,18 +420,18 @@ static inline bool partition_is_populated(struct cpuset *cs, * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. 
*/ -static void guarantee_online_cpus(struct task_struct *tsk, +static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; - if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) - cpumask_copy(pmask, cpu_online_mask); + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); @@ -397,94 +462,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; + + return 0; +} + +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -494,7 +569,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -526,6 +604,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. 
The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -570,7 +689,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (cpuset_is_populated(cur)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -607,38 +726,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? */ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } @@ -1116,9 +1208,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). 
*/ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1234,7 +1328,6 @@ static void reset_partition_data(struct cpuset *cs) lockdep_assert_held(&callback_lock); - cs->nr_subparts = 0; if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) @@ -1257,6 +1350,8 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; } /* @@ -1264,15 +1359,12 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_add(int new_prs, struct cpuset *parent, +static void partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1282,13 +1374,11 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (new_prs != parent->partition_root_state); - if (isolcpus_updated) + if (new_prs != parent->partition_root_state) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } /* @@ -1296,15 +1386,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_del(int old_prs, struct cpuset *parent, +static void partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1313,27 +1400,95 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (old_prs != parent->partition_root_state); - if (isolcpus_updated) + if (old_prs != parent->partition_root_state) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } -static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). 
+ */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) { - int ret; + cpumask_var_t full_hk_cpus; + int res = true; - lockdep_assert_cpus_held(); + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; + + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; - if (!isolcpus_updated) + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) +{ + int ret; + + if (!isolated_cpus_updating) return; + lockdep_assert_cpus_held(); + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + isolated_cpus_updating = false; } /** @@ -1347,38 +1502,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. 
*/ -static int compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus, - struct cpuset *real_cs) +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) { struct cgroup_subsys_state *css; - struct cpuset *parent = parent_cs(cs); struct cpuset *sibling; int retval = 0; - if (!xcpus) - xcpus = cs->effective_xcpus; - - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); - - if (!real_cs) { - if (!cpumask_empty(cs->exclusive_cpus)) - return 0; - } else { - cs = real_cs; - } + if (cpumask_empty(excpus)) + return retval; /* * Exclude exclusive CPUs from siblings @@ -1388,25 +1530,69 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs, if (sibling == cs) continue; - if (!cpumask_empty(sibling->exclusive_cpus) && - cpumask_intersects(xcpus, sibling->exclusive_cpus)) { - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } - if (!cpumask_empty(sibling->effective_xcpus) && - cpumask_intersects(xcpus, sibling->effective_xcpus)) { - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); + return retval; } +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + static inline bool is_remote_partition(struct cpuset *cs) { - return !list_empty(&cs->remote_sibling); + return cs->remote_partition; } static inline bool is_local_partition(struct cpuset *cs) @@ -1427,8 +1613,6 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { - bool isolcpus_updated; - /* * The user must have sysadmin privilege. 
*/ @@ -1439,22 +1623,28 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * - * Note that if there is any local partition root above it or - * remote partition root underneath it, its exclusive_cpus must - * have overlapped with subpartitions_cpus. + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); - if (cpumask_empty(tmp->new_cpus) || - cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + compute_excpus(cs, tmp->new_cpus); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) || + prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) + return PERR_HKEEPING; spin_lock_irq(&callback_lock); - isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); - list_add(&cs->remote_sibling, &remote_children); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1477,25 +1667,29 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { - bool isolcpus_updated; - WARN_ON_ONCE(!is_remote_partition(cs)); - WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + /* + * When a CPU is offlined, top_cpuset may end up with no available CPUs, + * which should clear subpartitions_cpus. We should not emit a warning for this + * scenario: the hierarchy is updated from top to bottom, so subpartitions_cpus + * may already be cleared when disabling the partition. + */ + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus) && + !cpumask_empty(subpartitions_cpus)); spin_lock_irq(&callback_lock); - list_del_init(&cs->remote_sibling); - isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, cs->effective_xcpus); + cs->remote_partition = false; + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1520,7 +1714,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, { bool adding, deleting; int prs = cs->partition_root_state; - int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1541,20 +1734,24 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, * left in the top cpuset. 
*/ if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); if (!capable(CAP_SYS_ADMIN)) cs->prs_err = PERR_ACCESS; else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; if (cs->prs_err) goto invalidate; } spin_lock_irq(&callback_lock); if (adding) - isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); + partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. @@ -1563,7 +1760,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1578,26 +1775,6 @@ invalidate: remote_partition_disable(cs, tmp); } -/* - * prstate_housekeeping_conflict - check for partition & housekeeping conflicts - * @prstate: partition root state to be checked - * @new_cpus: cpu mask - * Return: true if there is conflict, false otherwise - * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an - * isolated partition. - */ -static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) -{ - if (!have_boot_isolcpus) - return false; - - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) - return true; - - return false; -} - /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1644,13 +1821,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - int subparts_delta = 0; - int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); + int parent_prs = parent->partition_root_state; bool nocpu; lockdep_assert_held(&cpuset_mutex); - WARN_ON_ONCE(is_remote_partition(cs)); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and @@ -1660,7 +1836,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1669,10 +1845,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (old_prs > 0) { + if (old_prs > 0) new_prs = -old_prs; - subparts_delta--; - } + goto write_error; } @@ -1692,13 +1867,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. 
*/ - xcpus = tmp->new_cpus; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + xcpus = tmp->delmask; + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its @@ -1710,17 +1886,26 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; + if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && + !isolated_cpus_can_update(xcpus, NULL)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + /* - * A parent can be left with no CPU as long as there is no - * task directly associated with the parent partition. + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. */ - if (nocpu) - return PERR_NOCPUS; + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); - deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus); - if (deleting) - subparts_delta++; - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; + deleting = true; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1731,7 +1916,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; - subparts_delta--; } new_prs = PRS_MEMBER; } else if (newmask) { @@ -1759,8 +1943,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * * For invalid partition: * delmask = newmask & parent->effective_xcpus + * The partition may become valid soon. */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -1773,6 +1958,32 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } + + /* + * TBD: Invalidate a currently valid child root partition may + * still break isolated_cpus_can_update() rule if parent is an + * isolated partition. + */ + if (is_partition_valid(cs) && (old_prs != parent_prs)) { + if ((parent_prs == PRS_ROOT) && + /* Adding to parent means removing isolated CPUs */ + !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) + part_error = PERR_HKEEPING; + if ((parent_prs == PRS_ISOLATED) && + /* Adding to parent means adding isolated CPUs */ + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + part_error = PERR_HKEEPING; + } + + /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. @@ -1800,13 +2011,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. 
*/ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - } else if (is_partition_invalid(cs) && + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; @@ -1846,17 +2056,13 @@ write_error: switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) { + if (part_error) new_prs = -old_prs; - subparts_delta--; - } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) { + if (!part_error) new_prs = -old_prs; - subparts_delta++; - } break; } } @@ -1885,28 +2091,20 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (old_prs != new_prs) { + if (old_prs != new_prs) cs->partition_root_state = new_prs; - if (new_prs <= 0) - cs->nr_subparts = 0; - } + /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. */ if (adding) - isolcpus_updated += partition_xcpus_del(old_prs, parent, - tmp->addmask); + partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_add(new_prs, parent, - tmp->delmask); + partition_xcpus_add(new_prs, parent, tmp->delmask); - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -1959,7 +2157,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -1988,8 +2186,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, */ spin_lock_irq(&callback_lock); make_partition_invalid(child); - cs->nr_subparts--; - child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; @@ -2018,7 +2214,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); @@ -2038,7 +2233,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. 
*/ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2140,7 +2335,7 @@ get_css: cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2182,15 +2377,12 @@ get_css: if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) - need_rebuild_sched_domains = true; + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - cpuset_force_rebuild(); } /** @@ -2247,82 +2439,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; - struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; - bool force = false; - int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; + return 0; +} - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. 
- */ - trialcs->partition_root_state = PRS_MEMBER; - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); - } + if (cs_is_member(trialcs)) + return PERR_NONE; - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } - } + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; - /* - * Check all the descendants in update_cpumasks_hier() if - * effective_xcpus is to be changed. - */ - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); @@ -2337,7 +2501,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ - invalidate = true; + trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); @@ -2345,36 +2509,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } + return retval; +} - if (retval < 0) - goto out_free; +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. 
+ */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { - struct cpumask *xcpus = trialcs->effective_xcpus; + if (cs_is_member(cs)) + return; - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) - xcpus = trialcs->cpus_allowed; + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; - /* - * Call remote_cpus_update() to handle valid remote partition - */ - if (is_remote_partition(cs)) - remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, - xcpus, &tmp); + trialcs->effective_xcpus, tmp); } +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + if (retval < 0) + goto out_free; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); @@ -2390,7 +2610,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2407,33 +2627,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; - if (!*buf) { - cpumask_clear(trialcs->exclusive_cpus); - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->exclusive_cpus); - if (retval < 0) - return retval; - } + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; - /* - * Reject the change if there is exclusive CPUs conflict with - * the siblings. - */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) - return -EINVAL; - } + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. 
+ */ + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if @@ -2445,35 +2655,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; - if (old_prs) { - if (cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); - if (is_remote_partition(cs)) { - if (invalidate) - remote_partition_disable(cs, &tmp); - else - remote_cpus_update(cs, trialcs->exclusive_cpus, - trialcs->effective_xcpus, &tmp); - } else if (invalidate) { - update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); - } else { - update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); - } - } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2493,7 +2680,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2545,9 +2732,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -2713,41 +2915,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + return retval; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) + return -EINVAL; + + /* No change? 
nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; check_insane_mems_config(&trialcs->mems_allowed); @@ -2757,8 +2942,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) @@ -2789,7 +2973,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2847,10 +3031,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -2896,7 +3080,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) * A change in load balance state only, no change in cpumasks. * Need to update isolated_cpus. */ - isolcpus_updated = true; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || + prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) + err = PERR_HKEEPING; + else + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -2931,7 +3120,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -2946,7 +3135,7 @@ out: notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -3082,7 +3271,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); + guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); @@ -3104,6 +3293,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3154,15 +3344,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. 
*/ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3186,13 +3379,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3217,9 +3413,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3321,14 +3517,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - css_get(&cs->css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - css_put(&cs->css); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3427,19 +3619,13 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; - INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3458,10 +3644,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3513,8 +3696,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3524,42 +3706,33 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. */ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. 
+ */ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3688,7 +3861,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, @@ -3724,7 +3896,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -3812,8 +3983,9 @@ retry: if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); - if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL)) { + if (remote && (cpumask_empty(subpartitions_cpus) || + (cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL)))) { cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); @@ -3826,15 +3998,19 @@ retry: * 1) empty effective cpus but not valid empty partition. * 2) parent is invalid or doesn't grant any cpus to child * partitions. + * 3) subpartitions_cpus is empty. */ - if (is_local_partition(cs) && (!is_partition_valid(parent) || - tasks_nocpu_error(parent, cs, &new_cpus))) + if (is_local_partition(cs) && + (!is_partition_valid(parent) || + tasks_nocpu_error(parent, cs, &new_cpus) || + cpumask_empty(subpartitions_cpus))) partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned - * back to a regular one. + * back to a regular one with a non-empty effective xcpus. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { @@ -3891,7 +4067,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); @@ -3924,7 +4100,6 @@ static void cpuset_handle_hotplug(void) */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { - top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, @@ -3971,7 +4146,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -4013,34 +4188,22 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); + hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); } -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 
- * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_var_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset, except when the task is in the top cpuset. - **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +/* + * Return cpus_allowed mask from a task's cpuset. + */ +static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - unsigned long flags; struct cpuset *cs; - spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - cs = task_cs(tsk); if (cs != &top_cpuset) - guarantee_online_cpus(tsk, pmask); + guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all @@ -4054,11 +4217,42 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); - if (!cpumask_intersects(pmask, cpu_online_mask)) + if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } +} - rcu_read_unlock(); +/** + * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Similar to cpuset_cpus_allowed() except that the caller must have acquired + * cpuset_mutex. + */ +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +{ + lockdep_assert_held(&cpuset_mutex); + __cpuset_cpus_allowed_locked(tsk, pmask); +} + +/** + * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_active_mask, even if this means going outside the + * task's cpuset, except when the task is in the top cpuset. + **/ + +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +{ + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); + __cpuset_cpus_allowed_locked(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4085,7 +4279,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); @@ -4131,9 +4325,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4164,7 +4356,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? 
* @gfp_mask: memory allocation flags * @@ -4203,7 +4395,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ @@ -4228,15 +4420,49 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely to be in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be inaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. 
+ */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3..81ea38dd6f9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 10b63433f057..e12b946278b6 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/page_counter.h> #include <linux/parser.h> +#include <linux/rculist.h> #include <linux/slab.h> struct dmem_cgroup_region { diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. 
*/ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 039d1eb2f215..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,18 +63,12 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; - unsigned int state; rcu_read_lock(); - /* Check if the cgroup is still FREEZING, but not FROZEN. The extra - * !FROZEN check is required, because the FREEZING bit is not cleared - * when the state FROZEN is reached. - */ - state = task_freezer(task)->state; - ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 2fa3a4fb2aaf..6a01d91ea4cb 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..db9617556dd7 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> - +#include <linux/nstree.h> /* cgroup namespaces */ @@ -21,33 +21,31 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(new_ns); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { @@ -87,14 +85,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns->ucounts = ucounts; new_ns->root_cset = cset; + ns_tree_add(new_ns); return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b2239156b7de..150e5871e66f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -9,102 +9,129 @@ #include <trace/events/cgroup.h> -static DEFINE_SPINLOCK(cgroup_rstat_lock); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); +static DEFINE_SPINLOCK(rstat_base_lock); +static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); -static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) -{ - return per_cpu_ptr(cgrp->rstat_cpu, cpu); -} - /* - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. + * Determines whether a given css can participate in rstat. + * css's that are cgroup::self use rstat for base stats. + * Other css's associated with a subsystem use rstat only when + * they define the ss->css_rstat_flush callback. */ -static __always_inline -unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, const bool fast_path) +static inline bool css_uses_rstat(struct cgroup_subsys_state *css) { - unsigned long flags; - bool contended; + return css_is_self(css) || css->ss->css_rstat_flush != NULL; +} - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. 
- */ - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); +static struct css_rstat_cpu *css_rstat_cpu( + struct cgroup_subsys_state *css, int cpu) +{ + return per_cpu_ptr(css->rstat_cpu, cpu); +} - raw_spin_lock_irqsave(cpu_lock, flags); - } +static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( + struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); +} - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); +static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) +{ + if (ss) + return &ss->rstat_ss_lock; - return flags; + return &rstat_base_lock; } -static __always_inline -void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, unsigned long flags, - const bool fast_path) +static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - raw_spin_unlock_irqrestore(cpu_lock, flags); + if (ss) + return per_cpu_ptr(ss->lhead, cpu); + return per_cpu_ptr(&rstat_backlog_list, cpu); } /** - * cgroup_rstat_updated - keep track of updated rstat_cpu - * @cgrp: target cgroup + * css_rstat_updated - keep track of updated rstat_cpu + * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * cgroup_rstat_cpu definition for details. + * Atomically inserts the css in the ss's llist for the given cpu. This is + * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist + * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either adds itself in + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). */ -__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - unsigned long flags; + struct llist_head *lhead; + struct css_rstat_cpu *rstatc; + struct llist_node *self; /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. + */ + if (!css_uses_rstat(css)) + return; + + lockdep_assert_preemption_disabled(); + + /* + * For archs without nmi safe cmpxchg or percpu ops support, ignore + * the requests from nmi context. + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + return; + + rstatc = css_rstat_cpu(css, cpu); + /* + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to concurrent flusher is + * needed. 
+ */ + if (llist_on_list(&rstatc->lnode)) + return; + + /* + * This function can be reentered by irqs and nmis for the same cgroup + * and may try to insert the same per-cpu lnode into the llist. Note + * that llist_add() does not protect against such scenarios. In addition + * this same per-cpu lnode can be modified through init_llist_node() + * from css_rstat_flush() running on a different CPU. + * + * To protect against such stacked contexts of irqs/nmis, we use the + * fact that lnode points to itself when not on a list and then use + * try_cmpxchg() to atomically set to NULL to select the winner + * which will call llist_add(). The losers can assume the insertion is + * successful and the winner will eventually add the per-cpu lnode to + * the llist. * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by - * testing the next pointer for NULL. + * Please note that we can not use this_cpu_cmpxchg() here as on some + * archs it is not safe against modifications from multiple CPUs. */ - if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) + self = &rstatc->lnode; + if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) return; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} - /* put @cgrp and all ancestors on the corresponding updated lists */ +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ + /* put @css and all ancestors on the corresponding updated lists */ while (true) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); + struct cgroup_subsys_state *parent = css->parent; + struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup @@ -115,53 +142,104 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) /* Root has no parent to link it to, but mark it busy */ if (!parent) { - rstatc->updated_next = cgrp; + rstatc->updated_next = css; break; } - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; - prstatc->updated_children = cgrp; + prstatc->updated_children = css; - cgrp = parent; + css = parent; } +} + +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; + + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where either the + * updater should add itself on the lockless list or the + * flusher should flush the stats updated by the updater who has + * observed that it is already on the list. The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such users, so not adding the + * barrier here but if such a use-case arises, please add + * smp_mb() here. 
+ */ - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** - * cgroup_rstat_push_children - push children cgroups into the given list + * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu - * Return: A new singly linked list of cgroups to be flush + * Return: A new singly linked list of css's to be flushed * - * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first before their next level children - * into a singly linked list built from the tail backward like "pushing" - * cgroups into a stack. The root is pushed by the caller. + * into a singly linked list via the rstat_flush_next pointer built from the + * tail backward like "pushing" css's into a stack. The root is pushed by + * the caller. */ -static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, - struct cgroup *child, int cpu) +static struct cgroup_subsys_state *css_rstat_push_children( + struct cgroup_subsys_state *head, + struct cgroup_subsys_state *child, int cpu) { - struct cgroup *chead = child; /* Head of child cgroup level */ - struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ - struct cgroup *parent, *grandchild; - struct cgroup_rstat_cpu *crstatc; + struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ + struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ + struct cgroup_subsys_state *parent, *grandchild; + struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; + /* + * The subsystem rstat lock must be held for the whole duration from + * here as the rstat_flush_next list is being constructed to when + * it is consumed later in css_rstat_flush(). 
+ */ + lockdep_assert_held(ss_rstat_lock(head->ss)); + + /* + * Notation: -> updated_next pointer + * => rstat_flush_next pointer + * + * Assuming the following sample updated_children lists: + * P: C1 -> C2 -> P + * C1: G11 -> G12 -> C1 + * C2: G21 -> G22 -> C2 + * + * After 1st iteration: + * head => C2 => C1 => NULL + * ghead => G21 => G11 => NULL + * + * After 2nd iteration: + * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL + */ next_level: - while (chead) { - child = chead; - chead = child->rstat_flush_next; - parent = cgroup_parent(child); + while (cnext) { + child = cnext; + cnext = child->rstat_flush_next; + parent = child->parent; - /* updated_next is parent cgroup terminated */ + /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; - crstatc = cgroup_rstat_cpu(child, cpu); + crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ @@ -175,7 +253,7 @@ next_level: } if (ghead) { - chead = ghead; + cnext = ghead; ghead = NULL; goto next_level; } @@ -183,51 +261,50 @@ next_level: } /** - * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed - * @root: root of the cgroup subtree to traverse + * css_rstat_updated_list - build a list of updated css's to be flushed + * @root: root of the css subtree to traverse * @cpu: target cpu - * Return: A singly linked list of cgroups to be flushed + * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, - * each returned cgroup is unlinked from the updated tree. + * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of - * child cgroups if not empty. Whereas updated_next is like a sibling link - * within the children list and terminated by the parent cgroup. An exception - * here is the cgroup root whose updated_next can be self terminated. + * child css's if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent css. An exception + * here is the css root whose updated_next can be self terminated. */ -static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) +static struct cgroup_subsys_state *css_rstat_updated_list( + struct cgroup_subsys_state *root, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); - struct cgroup *head = NULL, *parent, *child; - unsigned long flags; + struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); + struct cgroup_subsys_state *head = NULL, *parent, *child; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); + css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. 
*/ - parent = cgroup_parent(root); + parent = root->parent; if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; + struct css_rstat_cpu *prstatc; + struct cgroup_subsys_state **nextp; - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { - struct cgroup_rstat_cpu *nrstatc; + struct css_rstat_cpu *nrstatc; - nrstatc = cgroup_rstat_cpu(*nextp, cpu); + nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } @@ -242,16 +319,15 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) - head = cgroup_rstat_push_children(head, child, cpu); -unlock_ret: - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); + head = css_rstat_push_children(head, child, cpu); + return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. - * Together with providing bpf kfuncs for cgroup_rstat_updated() and - * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * Together with providing bpf kfuncs for css_rstat_updated() and + * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away @@ -271,7 +347,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); /* - * Helper functions for locking cgroup_rstat_lock. + * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicate lock @@ -279,115 +355,181 @@ __bpf_hook_end(); * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ -static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) - __acquires(&cgroup_rstat_lock) +static inline void __css_rstat_lock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __acquires(ss_rstat_lock(css->ss)) { + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; bool contended; - contended = !spin_trylock_irq(&cgroup_rstat_lock); + lock = ss_rstat_lock(css->ss); + contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); - spin_lock_irq(&cgroup_rstat_lock); + spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } -static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) - __releases(&cgroup_rstat_lock) +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __releases(ss_rstat_lock(css->ss)) { + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; + + lock = ss_rstat_lock(css->ss); trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); - spin_unlock_irq(&cgroup_rstat_lock); + spin_unlock_irq(lock); } /** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup + * css_rstat_flush - flush stats in @css's rstat subtree + * @css: target cgroup subsystem state * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. + * Collect all per-cpu stats in @css's subtree into the global counters + * and propagate them upwards. 
After this function returns, all rstat + * nodes in the subtree have up-to-date ->stat. * - * This also gets all cgroups in the subtree including @cgrp off the + * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { int cpu; + bool is_self = css_is_self(css); + + /* + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. + */ + if (!css_uses_rstat(css)) + return; might_sleep(); for_each_possible_cpu(cpu) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ - __cgroup_rstat_lock(cgrp, cpu); - pos = cgroup_rstat_updated_list(cgrp, cpu); + __css_rstat_lock(css, cpu); + pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); + if (is_self) { + cgroup_base_stat_flush(pos->cgroup, cpu); + bpf_rstat_flush(pos->cgroup, + cgroup_parent(pos->cgroup), cpu); + } else + pos->ss->css_rstat_flush(pos, cpu); } - __cgroup_rstat_unlock(cgrp, cpu); + __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } -int cgroup_rstat_init(struct cgroup *cgrp) +int css_rstat_init(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; + bool is_self = css_is_self(css); + + if (is_self) { + /* the root cgrp has rstat_base_cpu preallocated */ + if (!cgrp->rstat_base_cpu) { + cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); + if (!cgrp->rstat_base_cpu) + return -ENOMEM; + } + } else if (css->ss->css_rstat_flush == NULL) + return 0; + + /* the root cgrp's self css has rstat_cpu preallocated */ + if (!css->rstat_cpu) { + css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); + if (!css->rstat_cpu) { + if (is_self) + free_percpu(cgrp->rstat_base_cpu); - /* the root cgrp has rstat_cpu preallocated */ - if (!cgrp->rstat_cpu) { - cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); - if (!cgrp->rstat_cpu) return -ENOMEM; + } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); + + rstatc->owner = rstatc->updated_children = css; + init_llist_node(&rstatc->lnode); - rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); + if (is_self) { + struct cgroup_rstat_base_cpu *rstatbc; + + rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); + u64_stats_init(&rstatbc->bsync); + } } return 0; } -void cgroup_rstat_exit(struct cgroup *cgrp) +void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; - cgroup_rstat_flush(cgrp); + if (!css_uses_rstat(css)) + return; + + if (!css->rstat_cpu) + return; + + css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } - free_percpu(cgrp->rstat_cpu); - cgrp->rstat_cpu = NULL; + if (css_is_self(css)) { 
+ struct cgroup *cgrp = css->cgroup; + + free_percpu(cgrp->rstat_base_cpu); + cgrp->rstat_base_cpu = NULL; + } + + free_percpu(css->rstat_cpu); + css->rstat_cpu = NULL; } -void __init cgroup_rstat_boot(void) +/** + * ss_rstat_init - subsystem-specific rstat initialization + * @ss: target subsystem + * + * If @ss is NULL, the static locks associated with the base stats + * are initialized. If @ss is non-NULL, the subsystem-specific locks + * are initialized. + */ +int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; + if (ss) { + ss->lhead = alloc_percpu(struct llist_head); + if (!ss->lhead) + return -ENOMEM; + } + + spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + init_llist_head(ss_lhead_cpu(ss, cpu)); + + return 0; } /* @@ -420,9 +562,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; @@ -432,15 +574,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->bsync); - delta = rstatc->bstat; - } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + seq = __u64_stats_fetch_begin(&rstatbc->bsync); + delta = rstatbc->bstat; + } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ - cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); - cgroup_base_stat_add(&rstatc->last_bstat, &delta); - cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_bstat, &delta); + cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { @@ -449,73 +591,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); - delta = rstatc->subtree_bstat; - prstatc = cgroup_rstat_cpu(parent, cpu); - cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); - cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); - cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); + delta = rstatbc->subtree_bstat; + prstatbc = cgroup_rstat_base_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); + cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } -static struct cgroup_rstat_cpu * +static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; - rstatc = get_cpu_ptr(cgrp->rstat_cpu); - *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); - return rstatc; + rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); + *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); + return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc, + struct cgroup_rstat_base_cpu 
*rstatbc, unsigned long flags) { - u64_stats_update_end_irqrestore(&rstatc->bsync, flags); - cgroup_rstat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(rstatc); + u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); + css_rstat_updated(&cgrp->self, smp_processor_id()); + put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); - rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: - rstatc->bstat.ntime += delta_exec; + rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: - rstatc->bstat.cputime.utime += delta_exec; + rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->bstat.cputime.stime += delta_exec; + rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: - rstatc->bstat.forceidle_sum += delta_exec; + rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* @@ -574,12 +716,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush(cgrp); - __cgroup_rstat_lock(cgrp, -1); + css_rstat_flush(&cgrp->self); + __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); - __cgroup_rstat_unlock(cgrp, -1); + __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } @@ -601,10 +743,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) cgroup_force_idle_show(seq, &bstat); } -/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) -BTF_ID_FLAGS(func, cgroup_rstat_updated) -BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_ID_FLAGS(func, css_rstat_updated) +BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 8aafd050b754..9f6ab7dabf67 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y # # Debug Oops, Lockups and Hangs # -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y @@ -112,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FTRACE=y CONFIG_FUNCTION_TRACER=y +# +# Preemption +# +CONFIG_DEBUG_PREEMPT=y 
+CONFIG_PREEMPT=y diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index dd7c32fb5ac1..7c3924614e01 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -60,9 +60,15 @@ CONFIG_LIST_HARDENED=y # Initialize all heap variables to zero on allocation. CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# Initialize all heap variables to zero on free to reduce stale data lifetime. +CONFIG_INIT_ON_FREE_DEFAULT_ON=y + # Initialize all stack variables to zero on function entry. CONFIG_INIT_STACK_ALL_ZERO=y +# Wipe kernel stack after syscall completion to reduce stale data lifetime. +CONFIG_KSTACK_ERASE=y + # Wipe RAM at reboot via EFI. For more details, see: # https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/ # https://bugzilla.redhat.com/show_bug.cgi?id=1532058 @@ -87,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index b753695c5a8f..5dd0f0a34a73 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -2,3 +2,4 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y CONFIG_SLUB=y CONFIG_SLUB_TINY=y +CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 6878b9a49be8..1875a0a5047a 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -13,6 +13,8 @@ CONFIG_SCSI=y CONFIG_FB=y CONFIG_INPUT_MISC=y CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_ZONE_DEVICE=y CONFIG_TTY=y # Technically not required but otherwise produces # pretty useless systems starting from allnoconfig @@ -47,3 +49,4 @@ CONFIG_XEN_GNTDEV=m CONFIG_XEN_GRANT_DEV_ALLOC=m CONFIG_SWIOTLB_XEN=y CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_UNPOPULATED_ALLOC=y diff --git a/kernel/cpu.c b/kernel/cpu.c index b08bb34b1718..8df2d773fe3b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -37,6 +37,7 @@ #include <linux/cpuset.h> #include <linux/random.h> #include <linux/cc_platform.h> +#include <linux/parser.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -248,6 +249,14 @@ err: return ret; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + #ifdef CONFIG_SMP static bool cpuhp_is_ap_state(enum cpuhp_state state) { @@ -270,14 +279,6 @@ static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) complete(done); } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - /* Synchronization state management */ enum cpuhp_sync_state { SYNC_STATE_DEAD, @@ -1308,9 +1309,6 @@ static int takedown_cpu(unsigned int cpu) */ irq_lock_sparse(); - /* - * So now all preempt/rcu users must observe !cpu_active(). 
- */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ @@ -2069,11 +2067,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_PERF_PREPARE] = { - .name = "perf:prepare", - .startup.single = perf_event_init_cpu, - .teardown.single = perf_event_exit_cpu, - }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, @@ -2371,7 +2364,14 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + if (cpuhp_is_atomic_state(state)) { + guard(irqsave)(); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + /* STARTING/DYING must not fail! */ + WARN_ON_ONCE(ret); + } else { + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + } #endif BUG_ON(ret && !bringup); return ret; @@ -3092,10 +3092,13 @@ EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; +unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; +unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); +EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); @@ -3123,6 +3126,7 @@ void init_cpu_present(const struct cpumask *src) void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) @@ -3147,6 +3151,21 @@ void set_cpu_online(unsigned int cpu, bool online) } /* + * This should be marked __init, but there is a boatload of call sites + * which need to be fixed up to do so. Sigh... + */ +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus++; + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus--; + } +} + +/* * Activate the first processor. */ void __init boot_cpu_init(void) @@ -3179,8 +3198,38 @@ void __init boot_cpu_hotplug_init(void) #ifdef CONFIG_CPU_MITIGATIONS /* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations. + * All except the cross-thread attack vector are mitigated by default. + * Cross-thread mitigation often requires disabling SMT which is expensive + * so cross-thread mitigations are only partially enabled by default. + * + * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is + * present. + */ +static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = { + [CPU_MITIGATE_USER_KERNEL] = true, + [CPU_MITIGATE_USER_USER] = true, + [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM), + [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM), +}; + +bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) +{ + if (v < NR_CPU_ATTACK_VECTORS) + return attack_vectors[v]; + + WARN_ONCE(1, "Invalid attack vector %d\n", v); + return false; +} + +/* + * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally + * be combined with attack-vector disables which follow them. 
+ * + * Examples: + * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread + * mitigations=auto,nosmt,no_guest_host,no_guest_guest + * + * mitigations=off is equivalent to disabling all attack vectors. */ enum cpu_mitigations { CPU_MITIGATIONS_OFF, @@ -3188,19 +3237,96 @@ enum cpu_mitigations { CPU_MITIGATIONS_AUTO_NOSMT, }; +enum { + NO_USER_KERNEL, + NO_USER_USER, + NO_GUEST_HOST, + NO_GUEST_GUEST, + NO_CROSS_THREAD, + NR_VECTOR_PARAMS, +}; + +enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +static const match_table_t global_mitigations = { + { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"}, + { CPU_MITIGATIONS_AUTO, "auto"}, + { CPU_MITIGATIONS_OFF, "off"}, +}; + +static const match_table_t vector_mitigations = { + { NO_USER_KERNEL, "no_user_kernel"}, + { NO_USER_USER, "no_user_user"}, + { NO_GUEST_HOST, "no_guest_host"}, + { NO_GUEST_GUEST, "no_guest_guest"}, + { NO_CROSS_THREAD, "no_cross_thread"}, + { NR_VECTOR_PARAMS, NULL}, +}; + +static int __init mitigations_parse_global_opt(char *arg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) { + const char *pattern = global_mitigations[i].pattern; + + if (!strncmp(arg, pattern, strlen(pattern))) { + cpu_mitigations = global_mitigations[i].token; + return strlen(pattern); + } + } + + return 0; +} + static int __init mitigations_parse_cmdline(char *arg) { - if (!strcmp(arg, "off")) - cpu_mitigations = CPU_MITIGATIONS_OFF; - else if (!strcmp(arg, "auto")) - cpu_mitigations = CPU_MITIGATIONS_AUTO; - else if (!strcmp(arg, "auto,nosmt")) - cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; - else - pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", - arg); + char *s, *p; + int len; + + len = mitigations_parse_global_opt(arg); + + if (cpu_mitigations_off()) { + memset(attack_vectors, 0, sizeof(attack_vectors)); + smt_mitigations = SMT_MITIGATIONS_OFF; + } else if (cpu_mitigations_auto_nosmt()) { + smt_mitigations = SMT_MITIGATIONS_ON; + } + + p = arg + len; + + if (!*p) + return 0; + + /* Attack vector controls may come after the ',' */ + if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) { + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); + return 0; + } + + while ((s = strsep(&p, ",")) != NULL) { + switch (match_token(s, vector_mitigations, NULL)) { + case NO_USER_KERNEL: + attack_vectors[CPU_MITIGATE_USER_KERNEL] = false; + break; + case NO_USER_USER: + attack_vectors[CPU_MITIGATE_USER_USER] = false; + break; + case NO_GUEST_HOST: + attack_vectors[CPU_MITIGATE_GUEST_HOST] = false; + break; + case NO_GUEST_GUEST: + attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false; + break; + case NO_CROSS_THREAD: + smt_mitigations = SMT_MITIGATIONS_OFF; + break; + default: + pr_crit("Unsupported mitigations options %s\n", s); + return 0; + } + } return 0; } diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index b0f0d15085db..7481fbb947d3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); #ifdef CONFIG_PM -static int cpu_pm_suspend(void) +static int cpu_pm_suspend(void *data) { int ret; @@ -185,20 +185,24 @@ static int cpu_pm_suspend(void) return ret; } -static void cpu_pm_resume(void) +static void cpu_pm_resume(void *data) { cpu_cluster_pm_exit(); cpu_pm_exit(); } -static struct syscore_ops cpu_pm_syscore_ops = { +static const struct syscore_ops cpu_pm_syscore_ops = { .suspend 
= cpu_pm_suspend, .resume = cpu_pm_resume, }; +static struct syscore cpu_pm_syscore = { + .ops = &cpu_pm_syscore_ops, +}; + static int cpu_pm_init(void) { - register_syscore_ops(&cpu_pm_syscore_ops); + register_syscore(&cpu_pm_syscore); return 0; } core_initcall(cpu_pm_init); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,8 @@ #include <linux/reboot.h> #include <linux/btf.h> #include <linux/objtool.h> +#include <linux/delay.h> +#include <linux/panic.h> #include <asm/page.h> #include <asm/sections.h> @@ -33,6 +35,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +104,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +134,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); @@ -128,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -146,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); + panic_reset(); } } @@ -259,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, return 0; } +/** + * crash_exclude_mem_range - exclude a mem range for existing ranges + * @mem: mem->range contains an array of ranges sorted in ascending order + * @mstart: the start of to-be-excluded range + * @mend: the start of to-be-excluded range + * + * If you are unsure if a range split will happen, to avoid function call + * failure because of -ENOMEM, always make sure + * mem->max_nr_ranges == mem->nr_ranges + 1 + * before calling the function each time. 
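+ * For example, a caller that wants to survive a possible range split can
+ * over-allocate by one slot before each call (illustrative sketch only,
+ * not taken from this patch; it assumes the flexible "ranges" array at
+ * the end of struct crash_mem):
+ *
+ *	mem = vzalloc(struct_size(mem, ranges, nr_ranges + 1));
+ *	if (!mem)
+ *		return -ENOMEM;
+ *	mem->max_nr_ranges = nr_ranges + 1;
+ *	...fill mem->ranges[0 .. nr_ranges - 1] and set mem->nr_ranges...
+ *	ret = crash_exclude_mem_range(mem, mstart, mend);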
+ * + * returns 0 if a memory range is excluded successfully + * return -ENOMEM if mem->ranges doesn't have space to hold split ranges + */ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { @@ -318,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +EXPORT_SYMBOL_GPL(crash_exclude_mem_range); ssize_t crash_get_memory_size(void) { @@ -352,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c new file mode 100644 index 000000000000..8aadf6801530 --- /dev/null +++ b/kernel/crash_core_test.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> +#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there + +// Helper to create and initialize crash_mem +static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, + unsigned int nr_initial_ranges, + const struct range *initial_ranges) +{ + struct crash_mem *mem; + size_t alloc_size; + + // Check if max_ranges can even hold initial_ranges + if (max_ranges < nr_initial_ranges) { + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", + max_ranges, nr_initial_ranges); + return NULL; + } + + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); + if (!mem) { + kunit_err(test, "Failed to allocate crash_mem\n"); + return NULL; + } + + mem->max_nr_ranges = max_ranges; + mem->nr_ranges = nr_initial_ranges; + if (initial_ranges && nr_initial_ranges > 0) { + memcpy(mem->ranges, initial_ranges, + nr_initial_ranges * sizeof(struct range)); + } + + return mem; +} + +// Helper to compare ranges for assertions +static void assert_ranges_equal(struct kunit *test, + const struct range *actual_ranges, + unsigned int actual_nr_ranges, + const struct range *expected_ranges, + unsigned int expected_nr_ranges, + const char *case_name) +{ + unsigned int i; + + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, + "%s: Number of ranges mismatch.", case_name); + + for (i = 0; i < expected_nr_ranges; i++) { + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, + "%s: Range %u start mismatch.", case_name, i); + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, + "%s: Range %u end mismatch.", case_name, i); + } +} + +// Structure for test parameters +struct exclude_test_param { + const char *description; + unsigned long long exclude_start; + unsigned long long exclude_end; + unsigned int initial_max_ranges; + const struct range *initial_ranges; + unsigned int initial_nr_ranges; + const struct range *expected_ranges; + unsigned int expected_nr_ranges; + int expected_ret; +}; + +static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) +{ + struct crash_mem *mem; + int ret; + + kunit_info(test, "%s", params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return 
value mismatch.", params->description); + + if (params->expected_ret == 0) { + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. + */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, 
.expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. 
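+ *
+ * Each entry below replays a previously reported failure with its
+ * original input ranges, so a regression shows up as a mismatch in the
+ * return value or in the resulting range list.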
+ */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c new file mode 100644 index 000000000000..401423ba477d --- /dev/null +++ b/kernel/crash_dump_dm_crypt.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/key.h> +#include <linux/keyctl.h> +#include <keys/user-type.h> +#include <linux/crash_dump.h> +#include <linux/cc_platform.h> +#include <linux/configfs.h> +#include <linux/module.h> + +#define KEY_NUM_MAX 128 /* maximum dm crypt keys */ +#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */ +#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */ + +static unsigned int key_count; + +struct dm_crypt_key { + unsigned int key_size; + char key_desc[KEY_DESC_MAX_LEN]; + u8 data[KEY_SIZE_MAX]; +}; + +static struct keys_header { + unsigned int total_keys; + struct dm_crypt_key keys[] __counted_by(total_keys); +} *keys_header; + +static size_t get_keys_header_size(size_t total_keys) +{ + return struct_size(keys_header, keys, total_keys); +} + +unsigned long long dm_crypt_keys_addr; +EXPORT_SYMBOL_GPL(dm_crypt_keys_addr); + +static int __init setup_dmcryptkeys(char *arg) +{ + char *end; + + if (!arg) + return -EINVAL; + dm_crypt_keys_addr = memparse(arg, &end); + if (end > arg) + return 0; + + dm_crypt_keys_addr = 0; + return -EINVAL; +} + +early_param("dmcryptkeys", setup_dmcryptkeys); + +/* + * Architectures may override this function to read dm crypt keys + */ +ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec 
= { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); +} + +static int add_key_to_keyring(struct dm_crypt_key *dm_key, + key_ref_t keyring_ref) +{ + key_ref_t key_ref; + int r; + + /* create or update the requested key and add it to the target keyring */ + key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc, + dm_key->data, dm_key->key_size, + KEY_USR_ALL, KEY_ALLOC_IN_QUOTA); + + if (!IS_ERR(key_ref)) { + r = key_ref_to_ptr(key_ref)->serial; + key_ref_put(key_ref); + kexec_dprintk("Success adding key %s", dm_key->key_desc); + } else { + r = PTR_ERR(key_ref); + kexec_dprintk("Error when adding key"); + } + + key_ref_put(keyring_ref); + return r; +} + +static void get_keys_from_kdump_reserved_memory(void) +{ + struct keys_header *keys_header_loaded; + + arch_kexec_unprotect_crashkres(); + + keys_header_loaded = kmap_local_page(pfn_to_page( + kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT)); + + memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count)); + kunmap_local(keys_header_loaded); + arch_kexec_protect_crashkres(); +} + +static int restore_dm_crypt_keys_to_thread_keyring(void) +{ + struct dm_crypt_key *key; + size_t keys_header_size; + key_ref_t keyring_ref; + u64 addr; + + /* find the target keyring (which must be writable) */ + keyring_ref = + lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE); + if (IS_ERR(keyring_ref)) { + kexec_dprintk("Failed to get the user keyring\n"); + return PTR_ERR(keyring_ref); + } + + addr = dm_crypt_keys_addr; + dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr); + if (key_count < 0 || key_count > KEY_NUM_MAX) { + kexec_dprintk("Failed to read the number of dm-crypt keys\n"); + return -1; + } + + kexec_dprintk("There are %u keys\n", key_count); + addr = dm_crypt_keys_addr; + + keys_header_size = get_keys_header_size(key_count); + keys_header = kzalloc(keys_header_size, GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr); + + for (int i = 0; i < keys_header->total_keys; i++) { + key = &keys_header->keys[i]; + kexec_dprintk("Get key (size=%u)\n", key->key_size); + add_key_to_keyring(key, keyring_ref); + } + + return 0; +} + +static int read_key_from_user_keying(struct dm_crypt_key *dm_key) +{ + const struct user_key_payload *ukp; + struct key *key; + + kexec_dprintk("Requesting logon key %s", dm_key->key_desc); + key = request_key(&key_type_logon, dm_key->key_desc, NULL); + + if (IS_ERR(key)) { + pr_warn("No such logon key %s\n", dm_key->key_desc); + return PTR_ERR(key); + } + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (ukp->datalen > KEY_SIZE_MAX) { + pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); + return -EINVAL; + } + + memcpy(dm_key->data, ukp->data, ukp->datalen); + dm_key->key_size = ukp->datalen; + kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, + dm_key->key_desc, dm_key->data); + return 0; +} + +struct config_key { + struct config_item item; + const char *description; +}; + +static inline struct config_key *to_config_key(struct config_item *item) +{ + return container_of(item, struct config_key, item); +} + +static ssize_t config_key_description_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_config_key(item)->description); +} + +static ssize_t 
config_key_description_store(struct config_item *item, + const char *page, size_t count) +{ + struct config_key *config_key = to_config_key(item); + size_t len; + int ret; + + ret = -EINVAL; + len = strcspn(page, "\n"); + + if (len > KEY_DESC_MAX_LEN) { + pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN); + return ret; + } + + if (!len) + return ret; + + kfree(config_key->description); + ret = -ENOMEM; + config_key->description = kmemdup_nul(page, len, GFP_KERNEL); + if (!config_key->description) + return ret; + + return count; +} + +CONFIGFS_ATTR(config_key_, description); + +static struct configfs_attribute *config_key_attrs[] = { + &config_key_attr_description, + NULL, +}; + +static void config_key_release(struct config_item *item) +{ + kfree(to_config_key(item)); + key_count--; +} + +static struct configfs_item_operations config_key_item_ops = { + .release = config_key_release, +}; + +static const struct config_item_type config_key_type = { + .ct_item_ops = &config_key_item_ops, + .ct_attrs = config_key_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *config_keys_make_item(struct config_group *group, + const char *name) +{ + struct config_key *config_key; + + if (key_count > KEY_NUM_MAX) { + pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX); + return ERR_PTR(-EINVAL); + } + + config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL); + if (!config_key) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&config_key->item, name, &config_key_type); + + key_count++; + + return &config_key->item; +} + +static ssize_t config_keys_count_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", key_count); +} + +CONFIGFS_ATTR_RO(config_keys_, count); + +static bool is_dm_key_reused; + +static ssize_t config_keys_reuse_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", is_dm_key_reused); +} + +static ssize_t config_keys_reuse_store(struct config_item *item, + const char *page, size_t count) +{ + if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) { + kexec_dprintk( + "dm-crypt keys haven't be saved to crash-reserved memory\n"); + return -EINVAL; + } + + if (kstrtobool(page, &is_dm_key_reused)) + return -EINVAL; + + if (is_dm_key_reused) + get_keys_from_kdump_reserved_memory(); + + return count; +} + +CONFIGFS_ATTR(config_keys_, reuse); + +static struct configfs_attribute *config_keys_attrs[] = { + &config_keys_attr_count, + &config_keys_attr_reuse, + NULL, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. 
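+ *
+ * From user space, each key is a directory created under the
+ * crash_dm_crypt_keys configfs subsystem registered below: writing the
+ * dm-crypt logon key description to its "description" attribute is the
+ * only per-key setup, while the "count" and "reuse" attributes operate
+ * on the group as a whole.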
+ */ +static struct configfs_group_operations config_keys_group_ops = { + .make_item = config_keys_make_item, +}; + +static const struct config_item_type config_keys_type = { + .ct_group_ops = &config_keys_group_ops, + .ct_attrs = config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static bool restore; + +static ssize_t config_keys_restore_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", restore); +} + +static ssize_t config_keys_restore_store(struct config_item *item, + const char *page, size_t count) +{ + if (!restore) + restore_dm_crypt_keys_to_thread_keyring(); + + if (kstrtobool(page, &restore)) + return -EINVAL; + + return count; +} + +CONFIGFS_ATTR(config_keys_, restore); + +static struct configfs_attribute *kdump_config_keys_attrs[] = { + &config_keys_attr_restore, + NULL, +}; + +static const struct config_item_type kdump_config_keys_type = { + .ct_attrs = kdump_config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem config_keys_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "crash_dm_crypt_keys", + .ci_type = &config_keys_type, + }, + }, +}; + +static int build_keys_header(void) +{ + struct config_item *item = NULL; + struct config_key *key; + int i, r; + + if (keys_header != NULL) + kvfree(keys_header); + + keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + keys_header->total_keys = key_count; + + i = 0; + list_for_each_entry(item, &config_keys_subsys.su_group.cg_children, + ci_entry) { + if (item->ci_type != &config_key_type) + continue; + + key = to_config_key(item); + + if (!key->description) { + pr_warn("No key description for key %s\n", item->ci_name); + return -EINVAL; + } + + strscpy(keys_header->keys[i].key_desc, key->description, + KEY_DESC_MAX_LEN); + r = read_key_from_user_keying(&keys_header->keys[i]); + if (r != 0) { + kexec_dprintk("Failed to read key %s\n", + keys_header->keys[i].key_desc); + return r; + } + i++; + kexec_dprintk("Found key: %s\n", item->ci_name); + } + + return 0; +} + +int crash_load_dm_crypt_keys(struct kimage *image) +{ + struct kexec_buf kbuf = { + .image = image, + .buf_min = 0, + .buf_max = ULONG_MAX, + .top_down = false, + .random = true, + }; + int r; + + + if (key_count <= 0) { + kexec_dprintk("No dm-crypt keys\n"); + return -ENOENT; + } + + if (!is_dm_key_reused) { + image->dm_crypt_keys_addr = 0; + r = build_keys_header(); + if (r) + return r; + } + + kbuf.buffer = keys_header; + kbuf.bufsz = get_keys_header_size(key_count); + + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + r = kexec_add_buffer(&kbuf); + if (r) { + kvfree((void *)kbuf.buffer); + return r; + } + image->dm_crypt_keys_addr = kbuf.mem; + image->dm_crypt_keys_sz = kbuf.bufsz; + kexec_dprintk( + "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n", + kbuf.bufsz, kbuf.memsz); + + return r; +} + +static int __init configfs_dmcrypt_keys_init(void) +{ + int ret; + + if (is_kdump_kernel()) { + config_keys_subsys.su_group.cg_item.ci_type = + &kdump_config_keys_type; + } + + config_group_init(&config_keys_subsys.su_group); + mutex_init(&config_keys_subsys.su_mutex); + ret = configfs_register_subsystem(&config_keys_subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", ret, + config_keys_subsys.su_group.cg_item.ci_namebuf); + goto out_unregister; + } + + return 0; + +out_unregister: + configfs_unregister_subsystem(&config_keys_subsys); + + return ret; +} + 
+module_init(configfs_dmcrypt_keys_init); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index aff7c0fdbefa..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/cma.h> +#include <linux/crash_reserve.h> #include <asm/page.h> #include <asm/sections.h> @@ -131,7 +133,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("crahskernel: Memory value expected after '@'\n"); + pr_warn("crashkernel: Memory value expected after '@'\n"); return -EINVAL; } } @@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. */ @@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; @@ -457,9 +471,62 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); diff --git a/kernel/cred.c 
b/kernel/cred.c index 9676965c0981..a6f686b30da1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ @@ -287,7 +260,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; @@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +411,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index f625172d4b67..22fe969c5d2e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -30,6 +30,7 @@ #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/serial_core.h> +#include <linux/string.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -547,7 +548,7 @@ static void gdb_cmd_setregs(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); } else { gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } } @@ -577,7 +578,7 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) if (err) error_packet(remcom_out_buffer, err); else - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } #if DBG_MAX_REG_NUM > 0 @@ -630,7 +631,7 @@ static void gdb_cmd_reg_set(struct kgdb_state *ks) i = i / 2; kgdb_hex2mem(ptr, (char *)gdb_regs, i); dbg_set_reg(regnum, gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } #endif /* DBG_MAX_REG_NUM > 0 */ @@ -642,7 +643,7 @@ static void gdb_cmd_binwrite(struct kgdb_state *ks) if (err) error_packet(remcom_out_buffer, err); else - 
strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } /* Handle the 'D' or 'k', detach or kill packets */ @@ -656,7 +657,7 @@ static void gdb_cmd_detachkill(struct kgdb_state *ks) if (error < 0) { error_packet(remcom_out_buffer, error); } else { - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); kgdb_connected = 0; } put_packet(remcom_out_buffer); @@ -676,7 +677,7 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* For now, only honor R0 */ if (strcmp(remcom_in_buffer, "R0") == 0) { printk(KERN_CRIT "Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); put_packet(remcom_out_buffer); /* @@ -739,7 +740,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) case 'C': /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); + strscpy(remcom_out_buffer, "QC"); ks->threadid = shadow_pid(current->pid); int_to_threadref(thref, ks->threadid); pack_threadid(remcom_out_buffer + 2, thref); @@ -773,7 +774,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) int len = strlen(remcom_in_buffer + 6); if ((len % 2) != 0) { - strcpy(remcom_out_buffer, "E01"); + strscpy(remcom_out_buffer, "E01"); break; } kgdb_hex2mem(remcom_in_buffer + 6, @@ -785,14 +786,14 @@ static void gdb_cmd_query(struct kgdb_state *ks) kdb_parse(remcom_out_buffer); kdb_common_deinit_state(); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } break; #endif #ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT case 'S': if (!strncmp(remcom_in_buffer, "qSupported:", 11)) - strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + strscpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); break; case 'X': if (!strncmp(remcom_in_buffer, "qXfer:", 6)) @@ -822,7 +823,7 @@ static void gdb_cmd_task(struct kgdb_state *ks) } kgdb_usethread = thread; ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); break; case 'c': ptr = &remcom_in_buffer[2]; @@ -837,7 +838,7 @@ static void gdb_cmd_task(struct kgdb_state *ks) } kgdb_contthread = thread; } - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); break; } } @@ -851,7 +852,7 @@ static void gdb_cmd_thread(struct kgdb_state *ks) kgdb_hex2long(&ptr, &ks->threadid); thread = getthread(ks->linux_regs, ks->threadid); if (thread) - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); else error_packet(remcom_out_buffer, -EINVAL); } @@ -913,7 +914,7 @@ static void gdb_cmd_break(struct kgdb_state *ks) (int) length, *bpt_type - '0'); if (error == 0) - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); else error_packet(remcom_out_buffer, error); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b11b10b120c..61c1690058ed 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if (!(console_srcu_read_flags(c) & CON_ENABLED)) + short flags = console_srcu_read_flags(c); + + if (!console_is_usable(c, flags, true)) continue; if (c == dbg_io_ops->cons) continue; - if (!c->write) - continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. 
We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. - */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (flags & CON_NBCON) { + struct nbcon_write_context wctxt = { }; + + /* + * Do not continue if the console is NBCON and the context + * can't be acquired. + */ + if (!nbcon_kdb_try_acquire(c, &wctxt)) + continue; + + nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len); + + c->write_atomic(c, &wctxt); + nbcon_kdb_release(&wctxt); + } else { + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + } touch_nmi_watchdog(); } console_srcu_read_unlock(cookie); @@ -714,8 +731,8 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * it, depending on the results of the search. */ cp++; /* to byte after the newline */ - replaced_byte = *cp; /* remember what/where it was */ - cphold = cp; + replaced_byte = *cp; /* remember what it was */ + cphold = cp; /* remember where it was */ *cp = '\0'; /* end the string for our search */ /* @@ -732,8 +749,9 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * Shift the buffer left. */ *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); + len = strlen(cphold); + /* Use memmove() because the buffers overlap */ + memmove(kdb_buffer, cphold, len + 1); next_avail = kdb_buffer + len; size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; @@ -872,8 +890,9 @@ kdb_printit: */ if (kdb_grepping_flag && !suspend_grep) { *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); + len = strlen(cphold); + /* Use memmove() because the buffers overlap */ + memmove(kdb_buffer, cphold, len + 1); next_avail = kdb_buffer + len; size_avail = sizeof(kdb_buffer) - len; } diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 3a74604fdb8a..386d30e530b7 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -145,9 +145,6 @@ int kdb_get_kbd_char(void) return CTRL('F'); } - if (scancode == 0xe0) - return -1; - /* * For Japanese 86/106 keyboards * See comment in drivers/char/pc_keyb.c. 
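The strcpy() to strscpy() conversions above use the two-argument form of strscpy(), which takes its bound from the destination array's type, always NUL-terminates, and reports truncation instead of silently overrunning. A minimal sketch of the difference (buffer name and size are invented for illustration):

	char out[32];

	strcpy(out, msg);			/* unbounded: overflows "out" if msg is too long */
	if (strscpy(out, msg) == -E2BIG)	/* bounded by sizeof(out), always NUL-terminated */
		pr_warn("message truncated\n");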
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7a4d2d4689a5..dddf2b5aad57 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -721,20 +721,12 @@ static int kdb_defcmd(int argc, const char **argv) mp->name = kdb_strdup(argv[1], GFP_KDB); if (!mp->name) goto fail_name; - mp->usage = kdb_strdup(argv[2], GFP_KDB); + mp->usage = kdb_strdup_dequote(argv[2], GFP_KDB); if (!mp->usage) goto fail_usage; - mp->help = kdb_strdup(argv[3], GFP_KDB); + mp->help = kdb_strdup_dequote(argv[3], GFP_KDB); if (!mp->help) goto fail_help; - if (mp->usage[0] == '"') { - strcpy(mp->usage, argv[2]+1); - mp->usage[strlen(mp->usage)-1] = '\0'; - } - if (mp->help[0] == '"') { - strcpy(mp->help, argv[3]+1); - mp->help[strlen(mp->help)-1] = '\0'; - } INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; @@ -860,7 +852,7 @@ static void parse_grep(const char *str) kdb_printf("search string too long\n"); return; } - strcpy(kdb_grep_string, cp); + memcpy(kdb_grep_string, cp, len + 1); kdb_grepping_flag++; return; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index d2520d72b1f5..a2fc7d2bc9fc 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -110,6 +110,7 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, extern int kdbgetsymval(const char *, kdb_symtab_t *); extern int kdbnearsym(unsigned long, kdb_symtab_t *); extern char *kdb_strdup(const char *str, gfp_t type); +extern char *kdb_strdup_dequote(const char *str, gfp_t type); extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); /* Routine for debugging the debugger state. */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 05b137e7dcb9..56f7b906e7cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -23,6 +23,7 @@ #include <linux/uaccess.h> #include <linux/kdb.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/ctype.h> #include "kdb_private.h" @@ -246,11 +247,41 @@ void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, */ char *kdb_strdup(const char *str, gfp_t type) { - int n = strlen(str)+1; + size_t n = strlen(str) + 1; char *s = kmalloc(n, type); if (!s) return NULL; - return strcpy(s, str); + memcpy(s, str, n); + return s; +} + +/* + * kdb_strdup_dequote - same as kdb_strdup(), but trims surrounding quotes from + * the input string if present. + * Remarks: + * Quotes are only removed if there is both a leading and a trailing quote. + */ +char *kdb_strdup_dequote(const char *str, gfp_t type) +{ + size_t len = strlen(str); + char *s; + + if (str[0] == '"' && len > 1 && str[len - 1] == '"') { + /* trim both leading and trailing quotes */ + str++; + len -= 2; + } + + len++; /* add space for NUL terminator */ + + s = kmalloc(len, type); + if (!s) + return NULL; + + memcpy(s, str, len - 1); + s[len - 1] = '\0'; + + return s; } /* diff --git a/kernel/delayacct.c b/kernel/delayacct.c index eb63a021ac04..30e7912ebb0d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,15 @@ #include <linux/delayacct.h> #include <linux/module.h> +#define UPDATE_DELAY(type) \ +do { \ + d->type##_delay_max = tsk->delays->type##_delay_max; \ + d->type##_delay_min = tsk->delays->type##_delay_min; \ + tmp = d->type##_delay_total + tsk->delays->type##_delay; \ + d->type##_delay_total = (tmp < d->type##_delay_total) ? 
0 : tmp; \ + d->type##_count += tsk->delays->type##_count; \ +} while (0) + DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); - d->blkio_delay_max = tsk->delays->blkio_delay_max; - d->blkio_delay_min = tsk->delays->blkio_delay_min; - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - d->swapin_delay_max = tsk->delays->swapin_delay_max; - d->swapin_delay_min = tsk->delays->swapin_delay_min; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; - d->freepages_delay_max = tsk->delays->freepages_delay_max; - d->freepages_delay_min = tsk->delays->freepages_delay_min; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; - d->thrashing_delay_max = tsk->delays->thrashing_delay_max; - d->thrashing_delay_min = tsk->delays->thrashing_delay_min; - tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; - d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; - d->compact_delay_max = tsk->delays->compact_delay_max; - d->compact_delay_min = tsk->delays->compact_delay_min; - tmp = d->compact_delay_total + tsk->delays->compact_delay; - d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; - d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; - d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; - tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; - d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; - d->irq_delay_max = tsk->delays->irq_delay_max; - d->irq_delay_min = tsk->delays->irq_delay_min; - tmp = d->irq_delay_total + tsk->delays->irq_delay; - d->irq_delay_total = (tmp < d->irq_delay_total) ? 
0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - d->thrashing_count += tsk->delays->thrashing_count; - d->compact_count += tsk->delays->compact_count; - d->wpcopy_count += tsk->delays->wpcopy_count; - d->irq_count += tsk->delays->irq_count; + UPDATE_DELAY(blkio); + UPDATE_DELAY(swapin); + UPDATE_DELAY(freepages); + UPDATE_DELAY(thrashing); + UPDATE_DELAY(compact); + UPDATE_DELAY(wpcopy); + UPDATE_DELAY(irq); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8df0dfaaca18..d8fd6f779f79 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,6 +42,7 @@ #include <linux/memblock.h> #include <linux/err.h> #include <linux/sizes.h> +#include <linux/dma-buf/heaps/cma.h> #include <linux/dma-map-ops.h> #include <linux/cma.h> #include <linux/nospec.h> @@ -222,7 +223,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit) if (size_cmdline != -1) { selected_size = size_cmdline; selected_base = base_cmdline; - selected_limit = min_not_zero(limit_cmdline, limit); + + /* Hornor the user setup dma address limit */ + selected_limit = limit_cmdline ?: limit; + if (base_cmdline + size_cmdline == limit_cmdline) fixed = true; } else { @@ -238,6 +242,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } if (selected_size && !dma_contiguous_default_area) { + int ret; + pr_debug("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); @@ -245,6 +251,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit) selected_limit, &dma_contiguous_default_area, fixed); + + ret = dma_heap_cma_register_heap(dma_contiguous_default_area); + if (ret) + pr_warn("Couldn't register default CMA heap."); } } @@ -480,8 +490,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_err("Reserved memory: unable to setup CMA region\n"); return err; } - /* Architecture specific contiguous memory fixup. 
*/ - dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) dma_contiguous_default_area = cma; @@ -492,6 +500,10 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); + err = dma_heap_cma_register_heap(cma); + if (err) + pr_warn("Couldn't register CMA heap."); + return 0; } RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4..138ede653de4 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -23,6 +23,7 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/swiotlb.h> #include <asm/sections.h> #include "debug.h" @@ -38,7 +39,8 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, + dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -140,7 +142,8 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -592,7 +595,9 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && + is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } @@ -993,7 +998,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1051,17 +1057,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. 
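 * (i.e. the task stack sits in the linear map, so phys_to_virt() below
 * yields a valid kernel pointer; highmem physical addresses are skipped
 * because they have no linear mapping)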
*/ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1069,10 +1074,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1201,9 +1208,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1218,19 +1224,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1274,11 +1279,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1302,7 +1307,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } @@ -1442,47 +1447,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - 
check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { @@ -1581,6 +1545,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -54,15 +45,21 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } @@ -90,19 +87,6 @@ static inline void 
debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) @@ -126,5 +110,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b8fe0b3d0ffb..50c3fe2a1d55 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> +#include <linux/pci-p2pdma.h> #include "direct.h" /* @@ -119,7 +120,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); - struct page *page = NULL; + struct page *page; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); @@ -130,30 +131,25 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { - if (!dma_coherent_ok(dev, page_to_phys(page), size) || - (!allow_highmem && PageHighMem(page))) { - dma_free_contiguous(dev, page, size); - page = NULL; - } + if (dma_coherent_ok(dev, page_to_phys(page), size) && + (allow_highmem || !PageHighMem(page))) + return page; + + dma_free_contiguous(dev, page, size); } -again: - if (!page) - page = alloc_pages_node(node, gfp, get_order(size)); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + + while ((page = alloc_pages_node(node, gfp, get_order(size))) + && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); - page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { + !(gfp & (GFP_DMA32 | GFP_DMA))) gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { + else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } + else + return NULL; } return page; @@ -452,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -462,34 +458,34 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { - if (is_pci_p2pdma_page(sg_page(sg))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Any P2P 
mapping that traverses the PCI - * host bridge must be mapped with CPU physical - * address and not PCI bus addresses. This is - * done with dma_direct_map_page() below. - */ - break; - default: - ret = -EREMOTEIO; + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI host bridge + * must be mapped with CPU physical address and not PCI + * bus addresses. + */ + break; + case PCI_P2PDMA_MAP_NONE: + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; } - } - - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) { - ret = -EIO; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + sg->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(sg)); + sg_dma_len(sg) = sg->length; + sg_dma_mark_bus_address(sg); + continue; + default: + ret = -EREMOTEIO; goto out_unmap; } sg_dma_len(sg) = sg->length; @@ -502,22 +498,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if 
(!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 92de80e5b057..16a51736a2a3 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -11,17 +11,16 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; } -static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } -static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle, +static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* - * Dummy ops doesn't support map_page, so unmap_page should never be + * Dummy ops doesn't support map_phys, so unmap_page should never be * called. 
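[Illustrative aside, not part of the patch] The dma_dummy_ops conversion above shows the new callback shape: ->map_phys()/->unmap_phys() take a phys_addr_t plus an attrs word instead of a struct page pointer and offset. A minimal sketch of a dma_map_ops implementation written against the new callbacks could look as follows; the foo_* names are invented for illustration:

    static dma_addr_t foo_map_phys(struct device *dev, phys_addr_t phys,
                    size_t size, enum dma_data_direction dir, unsigned long attrs)
    {
            /* Translate the CPU physical address into a device-visible one;
             * an identity translation is used here purely for illustration. */
            return (dma_addr_t)phys;
    }

    static void foo_unmap_phys(struct device *dev, dma_addr_t dma_handle,
                    size_t size, enum dma_data_direction dir, unsigned long attrs)
    {
            /* Nothing to tear down for the identity translation above. */
    }

    static const struct dma_map_ops foo_dma_ops = {
            .map_phys       = foo_map_phys,
            .unmap_phys     = foo_unmap_phys,
    };

Because attrs now carries DMA_ATTR_MMIO, such an implementation can tell register space from RAM without consulting a struct page.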
*/ WARN_ON_ONCE(true); @@ -51,8 +50,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask) const struct dma_map_ops dma_dummy_ops = { .mmap = dma_dummy_mmap, - .map_page = dma_dummy_map_page, - .unmap_page = dma_dummy_unmap_page, + .map_phys = dma_dummy_map_phys, + .unmap_phys = dma_dummy_unmap_phys, .map_sg = dma_dummy_map_sg, .unmap_sg = dma_dummy_unmap_sg, .dma_supported = dma_dummy_supported, diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index cc19a3efea89..794041a39e65 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,13 +11,13 @@ #include <linux/dma-mapping.h> #include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/map_benchmark.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/timekeeping.h> +#include <uapi/linux/map_benchmark.h> struct map_benchmark_data { struct map_benchmark bparam; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 051a32988040..37163eb49f9f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,12 +152,12 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; + bool is_mmio = attrs & DMA_ATTR_MMIO; + dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -165,36 +165,65 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) - addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); - else - addr = ops->map_page(dev, page, offset, size, dir, attrs); - kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (ops->map_phys) + addr = ops->map_phys(dev, phys, size, dir, attrs); + + if (!is_mmio) + kmsan_handle_dma(phys, size, dir); + trace_dma_map_phys(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops 
*ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, addr, size, dir, attrs); - else - ops->unmap_page(dev, addr, size, dir, attrs); - trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (ops->unmap_phys) + ops->unmap_phys(dev, addr, size, dir, attrs); + trace_dma_unmap_phys(dev, addr, size, dir, attrs); + debug_dma_unmap_phys(dev, addr, size, dir); +} +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); } EXPORT_SYMBOL(dma_unmap_page_attrs); @@ -321,41 +350,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); @@ -443,6 +449,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL_GPL(__dma_need_sync); +/** + * dma_need_unmap - does this device need dma_unmap_* operations + * @dev: device to check + * + * If this function returns %false, drivers can skip calling dma_unmap_* after + * finishing an I/O. This function must be called after all mappings that might + * need to be unmapped have been performed. 
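[Illustrative aside, not part of the patch] With dma_map_phys()/dma_unmap_phys() exported above, and dma_map_resource() reduced to a wrapper that simply adds DMA_ATTR_MMIO, callers pass a CPU physical address directly. A hedged usage sketch; buf_phys and bar_phys are made-up example addresses:

    /* RAM backing a driver buffer: default attributes. */
    dma_addr_t buf_dma = dma_map_phys(dev, buf_phys, len, DMA_TO_DEVICE, 0);
    if (dma_mapping_error(dev, buf_dma))
            return -ENOMEM;

    /* MMIO range (e.g. a peer device BAR): per the dma-direct changes
     * above, no swiotlb bouncing and no CPU cache maintenance is done. */
    dma_addr_t bar_dma = dma_map_phys(dev, bar_phys, len, DMA_FROM_DEVICE,
                                      DMA_ATTR_MMIO);
    if (dma_mapping_error(dev, bar_dma)) {
            dma_unmap_phys(dev, buf_dma, len, DMA_TO_DEVICE, 0);
            return -ENOMEM;
    }

    /* ... issue the transfer ... */

    dma_unmap_phys(dev, bar_dma, len, DMA_FROM_DEVICE, DMA_ATTR_MMIO);
    dma_unmap_phys(dev, buf_dma, len, DMA_TO_DEVICE, 0);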
+ */ +bool dma_need_unmap(struct device *dev) +{ + if (!dma_map_direct(dev, get_dma_ops(dev))) + return true; + if (!dev->dma_skip_sync) + return true; + return IS_ENABLED(CONFIG_DMA_API_DEBUG); +} +EXPORT_SYMBOL_GPL(dma_need_unmap); + static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -694,7 +718,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -720,7 +744,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 9afd569eadb9..20caf9cabf69 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); struct page *page; + phys_addr_t phys; page = dma_alloc_contiguous(dev, size, gfp); if (!page) @@ -71,11 +72,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + phys = page_to_phys(page); if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, + *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); else - *dma_handle = ops->map_page(dev, page, 0, size, dir, + *dma_handle = ops->map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); @@ -92,10 +94,10 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, dma_handle, size, dir, + iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->unmap_page) - ops->unmap_page(dev, dma_handle, size, dir, + else if (ops->unmap_phys) + ops->unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 7b04f7575796..26392badc36b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -93,7 +93,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, page = dma_alloc_from_contiguous(NULL, 1 << order, order, false); if (!page) - page = alloc_pages(gfp, order); + page = alloc_pages(gfp | __GFP_NOWARN, order); } while (!page && order-- > 0); if (!page) goto out; @@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); + pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)), + __builtin_return_address(0)); if (!addr) goto free_page; #else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 9e2afad1c615..b7c1c0c92d0c 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -49,7 +49,7 @@ void 
*dma_common_contiguous_remap(struct page *page, size_t size, if (!pages) return NULL; for (i = 0; i < count; i++) - pages[i] = nth_page(page, i); + pages[i] = page++; vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kvfree(pages); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index abcf3fa63a56..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. @@ -1209,7 +1207,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pool) return -1; diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index d4b8bd0af79b..2333d70802e4 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o -obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o +obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o +obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o +obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 20154572ede9..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,100 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/context_tracking.h> -#include <linux/entry-common.h> +#include <linux/irq-entry-common.h> #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> #include <linux/kmsan.h> #include <linux/livepatch.h> -#include <linux/audit.h> #include <linux/tick.h> -#include "common.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} - -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) -{ - long ret = 0; - - /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. 
- */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; -} - -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -133,104 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) -{ - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -static void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); -} - -/* - * Syscall specific exit to user mode preparation. Runs with interrupts - * enabled. +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller */ -static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long nr = syscall_get_nr(current, regs); - - CT_WARN_ON(ct_state() != CT_STATE_KERNEL); + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) - local_irq_enable(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); } - - rseq_syscall(regs); - - /* - * Do one-time syscall specific work. 
If these work items are - * enabled, we want to run them exactly once per syscall exit with - * interrupts enabled. - */ - if (unlikely(work & SYSCALL_WORK_EXIT)) - syscall_exit_work(regs, work); -} - -static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - syscall_exit_to_user_mode_prepare(regs); - local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); -} - -void syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - __syscall_exit_to_user_mode_work(regs); -} - -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_exit_to_user_mode_work(regs); - instrumentation_end(); - exit_to_user_mode(); -} - -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) -{ - enter_from_user_mode(regs); -} - -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) @@ -267,7 +114,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required @@ -300,6 +148,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. + */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -307,7 +169,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c new file mode 100644 index 000000000000..940a597ded40 --- /dev/null +++ b/kernel/entry/syscall-common.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/audit.h> +#include <linux/entry-common.h> +#include "common.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work) +{ + long ret = 0; + + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. 
+ */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + + /* Handle ptrace */ + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { + ret = ptrace_report_syscall_entry(regs); + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (work & SYSCALL_WORK_SECCOMP) { + ret = __secure_computing(); + if (ret == -1L) + return ret; + } + + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { + trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +/* + * If SYSCALL_EMU is set, then the only reason to report is when + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_user_mode(). + */ +static inline bool report_single_step(unsigned long work) +{ + if (work & SYSCALL_WORK_SYSCALL_EMU) + return false; + + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; +} + +void syscall_exit_work(struct pt_regs *regs, unsigned long work) +{ + bool step; + + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + + audit_syscall_exit(regs); + + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(work); + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) + ptrace_report_syscall_exit(regs, step); +} diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 5340c5aa89e7..a9055eccb27e 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon */ if (offset && offset + len <= offset) return -EINVAL; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; /* - * access_ok() will clear memory tags for tagged addresses - * if current has memory tagging enabled. - - * To enable a tracer to set a tracees selector the - * selector address must be untagged for access_ok(), - * otherwise an untagged tracer will always fail to set a - * tagged tracees selector. + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. 
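[Illustrative aside, not part of the patch] Judging from the range inversion above, PR_SYS_DISPATCH_INCLUSIVE_ON lets userspace name the region whose syscalls should be dispatched, rather than the region that is exempt as with PR_SYS_DISPATCH_EXCLUSIVE_ON. A hedged userspace sketch, assuming headers from a kernel carrying this change; the function and region names are invented:

    #include <err.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    static volatile char sud_selector;  /* SYSCALL_DISPATCH_FILTER_ALLOW/BLOCK */

    static void enable_sud_for_region(unsigned long start, unsigned long len)
    {
            /*
             * Inclusive mode: only syscalls issued from [start, start + len)
             * are dispatched to the SIGSYS handler; code outside the region
             * (libc, the runtime, ...) keeps making syscalls normally.
             */
            if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_INCLUSIVE_ON,
                      start, len, &sud_selector))
                    err(1, "PR_SET_SYSCALL_USER_DISPATCH");
    }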
*/ - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) - return -EFAULT; - + offset = offset + len; + len = -len; break; default: return -EINVAL; } + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + task->syscall_dispatch.selector = selector; task->syscall_dispatch.offset = offset; task->syscall_dispatch.len = len; task->syscall_dispatch.on_dispatch = false; - if (mode == PR_SYS_DISPATCH_ON) + if (mode != PR_SYS_DISPATCH_OFF) set_task_syscall_work(task, SYSCALL_USER_DISPATCH); else clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); diff --git a/kernel/entry/kvm.c b/kernel/entry/virt.c index 8485f63863af..c52f99249763 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/virt.c @@ -1,17 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/entry-kvm.h> -#include <linux/kvm_host.h> +#include <linux/entry-virt.h> -static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +static int xfer_to_guest_mode_work(unsigned long ti_work) { do { int ret; - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - kvm_handle_signal_exit(vcpu); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) return -EINTR; - } if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); @@ -19,7 +16,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(NULL); - ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + ret = arch_xfer_to_guest_mode_handle_work(ti_work); if (ret) return ret; @@ -28,7 +25,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return 0; } -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +int xfer_to_guest_mode_handle_work(void) { unsigned long ti_work; @@ -44,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; - return xfer_to_guest_mode_work(vcpu, ti_work); + return xfer_to_guest_mode_work(ti_work); } EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d01..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && 
!user_mode(regs)) { if (add_mark) @@ -240,25 +244,31 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + goto exit_put; + regs = task_pt_regs(current); } - if (regs) { - if (crosstask) - goto exit_put; + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 95e703891b24..dad0d3d2e85f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -207,6 +208,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -938,13 +952,19 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). 
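[Illustrative aside, not part of the patch] The class_perf_ctx_lock_t constructor/destructor pair added above hooks perf_ctx_lock()/perf_ctx_unlock() into the scope-based cleanup machinery from linux/cleanup.h, which is what lets perf_cgroup_switch() drop its explicit unlock. A brief sketch of how such a guard behaves; the function is invented:

    static void example_cgroup_path(struct perf_cpu_context *cpuctx,
                                    struct perf_event_context *task_ctx)
    {
            guard(perf_ctx_lock)(cpuctx, task_ctx); /* both ctx locks taken here */

            if (!READ_ONCE(cpuctx->cgrp))
                    return;         /* destructor unlocks on this early return */

            /* ... work that needs both contexts locked ... */
    }                               /* ... and on normal scope exit as well */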
+ */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -962,7 +982,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1270,6 +1289,10 @@ static void put_ctx(struct perf_event_context *ctx) if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -2116,18 +2139,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2145,8 +2156,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2167,7 +2179,7 @@ static void perf_put_aux_event(struct perf_event *event) * If the event is an aux_event, tear down all links to * it from other events. */ - for_each_sibling_event(iter, event->group_leader) { + for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; @@ -2179,8 +2191,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2238,18 +2249,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2285,8 +2284,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. 
+ */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2311,8 +2317,6 @@ out: perf_event__header_size(leader); } -static void sync_child_event(struct perf_event *child_event); - static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; @@ -2325,9 +2329,12 @@ static void perf_child_detach(struct perf_event *event) if (WARN_ON_ONCE(!parent_event)) return; + /* + * Can't check this from an IPI, the holder is likey another CPU. + * lockdep_assert_held(&parent_event->child_mutex); + */ - sync_child_event(event); list_del_init(&event->child_list); } @@ -2343,6 +2350,11 @@ event_filter_match(struct perf_event *event) perf_cgroup_match(event); } +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; +} + static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { @@ -2380,7 +2392,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } @@ -2450,8 +2462,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL -#define DETACH_EXIT 0x08UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2477,12 +2490,18 @@ __perf_remove_from_context(struct perf_event *event, */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; - if (flags & DETACH_DEAD) { - event->pending_disable = 1; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; + if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; - } + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) @@ -2545,6 +2564,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2559,13 +2587,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. 
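[Illustrative aside, not part of the patch] The new is_event_in_freq_mode() helper replaces the repeated "event->attr.freq && event->attr.sample_freq" test. For reference, this is the frequency-mode configuration it detects, sketched as a userspace attribute setup with illustrative values:

    struct perf_event_attr attr = {
            .type           = PERF_TYPE_HARDWARE,
            .config         = PERF_COUNT_HW_CPU_CYCLES,
            .size           = sizeof(attr),
            .freq           = 1,      /* interpret sample_freq, not sample_period */
            .sample_freq    = 4000,   /* target samples per second */
    };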
+ */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } @@ -2628,6 +2661,47 @@ void perf_event_disable_inatomic(struct perf_event *event) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); +static void perf_event_unthrottle(struct perf_event *event, bool start) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->hw.interrupts = 0; + if (start) + event->pmu->start(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 1); +} + +static void perf_event_throttle(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 0); +} + +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); + for_each_sibling_event(sibling, leader) + perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true); +} + +static void perf_event_throttle_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_throttle(leader); + for_each_sibling_event(sibling, leader) + perf_event_throttle(sibling); +} + static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { @@ -2656,10 +2730,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) + perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); @@ -2674,7 +2746,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } @@ -3900,7 +3972,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -4237,14 +4309,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list) hwc = &event->hw; - if (hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - if (!event->attr.freq || !event->attr.sample_freq) - event->pmu->start(event, 0); - } + if (hwc->interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) continue; /* @@ -4516,7 +4584,9 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + struct task_struct *task, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4543,7 +4613,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - 
perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, ctx->task, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -5125,6 +5195,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5521,6 +5592,8 @@ static void perf_free_addr_filters(struct perf_event *event); /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { + struct pmu *pmu = event->pmu; + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5550,6 +5623,7 @@ static void __free_event(struct perf_event *event) * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ + WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); @@ -5562,8 +5636,13 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) - module_put(event->pmu->module); + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -5600,13 +5679,13 @@ static void _free_event(struct perf_event *event) /* * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. + * of inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } @@ -5689,7 +5768,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; - LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not @@ -5718,15 +5796,17 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - void *var = NULL; - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). @@ -5759,44 +5839,24 @@ again: tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_move(&child->child_list, &free_list); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { - var = &ctx->refcount; + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); - put_ctx(ctx); - if (var) { - /* - * If perf_event_free_task() has deleted all events from the - * ctx while the child_mutex got released above, make sure to - * notify about the preceding put_ctx(). 
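[Illustrative aside, not part of the patch] put_ctx() now issues wake_up_var(&ctx->refcount) itself when the context belongs to a dead task, with smp_mb__after_atomic() pairing against a sleeper's wait_var_event(). The waiting side follows the usual pattern sketched below; the exact wake-up condition lives elsewhere in this file and is only illustrated here:

    /* Sleeper (illustrative): wait until only our own reference is left. */
    wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);

    /* Waker, as added to put_ctx() above: */
    smp_mb__after_atomic();         /* pairs with wait_var_event() */
    wake_up_var(&ctx->refcount);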
- */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); } + put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); - list_for_each_entry_safe(child, tmp, &free_list, child_list) { - void *var = &child->ctx->refcount; - - list_del(&child->child_list); - /* Last reference unless ->pending_task work is pending */ - put_event(child); - - /* - * Wake any perf_event_free_task() waiting for this event to be - * freed. - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); - } - no_ctx: /* * Last reference unless ->pending_task work is pending on this event @@ -6068,8 +6128,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; @@ -6167,14 +6233,6 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); - /* - * We could be throttled; unthrottle now to avoid the tick - * trying to unthrottle while we already re-started the event. - */ - if (event->hw.interrupts == MAX_INTERRUPTS) { - event->hw.interrupts = 0; - perf_log_throttle(event, 1); - } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -6182,6 +6240,14 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); + /* + * Once the period is force-reset, the event starts immediately. + * But the event/group could be throttled. Unthrottle the + * event/group now to avoid the next tick trying to unthrottle + * while we already re-started the event/group. 
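[Illustrative aside, not part of the patch] __perf_event_period() now defers unthrottling until after the event has been restarted, and unthrottles the whole group. For context, this path is reached when userspace changes the sampling period at runtime; a hedged sketch, assuming perf_fd came from perf_event_open():

    #include <err.h>
    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    static void set_period(int perf_fd, __u64 new_period)
    {
            /* Force-reset the sampling period; the kernel path above restarts
             * the event and, if it had been throttled, unthrottles its group. */
            if (ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period))
                    err(1, "PERF_EVENT_IOC_PERIOD");
    }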
+ */ + if (event->hw.interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, true); perf_pmu_enable(event->pmu); } } @@ -6239,12 +6305,18 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6301,7 +6373,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6620,18 +6692,31 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6647,21 +6732,23 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex * to avoid complications. 
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6681,10 +6768,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6760,10 +6847,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; } +static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting perf mappings to prevent refcount leaks due to + * the resulting non-matching offsets and sizes. See open()/close(). + */ + return -EINVAL; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .pfn_mkwrite = perf_mmap_pfn_mkwrite, + .may_split = perf_mmap_may_split, }; static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) @@ -6835,215 +6932,241 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } -static int perf_mmap(struct file *file, struct vm_area_struct *vma) +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) { - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; + unsigned long user_locked, user_lock_limit, locked, lock_limit; struct user_struct *user = current_user(); - struct mutex *aux_mutex = NULL; - struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; - int ret, flags = 0; - /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. 
- */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; + user_locked = atomic_long_read(&user->locked_vm); - ret = security_perf_event_read(event); - if (ret) - return ret; + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; - vma_size = vma->vm_end - vma->vm_start; - nr_pages = vma_size / PAGE_SIZE; + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } - if (nr_pages > INT_MAX) - return -ENOMEM; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; - if (vma_size != PAGE_SIZE * nr_pages) - return -EINVAL; + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} - user_extra = nr_pages; +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} - if (vma->vm_pgoff == 0) { - nr_pages -= 1; +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; + nr_pages -= 1; - WARN_ON_ONCE(event->ctx->parent_ctx); + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; + WARN_ON_ONCE(event->ctx->parent_ctx); - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. - */ - ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; - } + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb + * Success -- managed to mmap() the same buffer + * multiple times. */ - ring_buffer_attach(event, NULL); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; } - } else { /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. 
+ * Raced against perf_mmap_close()'s + * refcount_dec_and_mutex_lock() remove the + * event and continue as if !event->rb */ - u64 aux_offset, aux_size; - - rb = event->rb; - if (!rb) - goto aux_unlock; + ring_buffer_attach(event, NULL); + } - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + if (!rb) + return -ENOMEM; - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + refcount_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + ring_buffer_attach(event, rb); - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); - if (!is_power_of_2(nr_pages)) - goto aux_unlock; + perf_mmap_account(vma, user_extra, extra); + refcount_set(&event->mmap_count, 1); - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; + return 0; +} - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - goto unlock; - } +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; - atomic_set(&rb->aux_mmap_count, 1); - } + rb = event->rb; + if (!rb) + return -EINVAL; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + guard(mutex)(&rb->aux_mutex); /* - * Increase the limit linearly with more CPUs: + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
*/ - user_lock_limit *= num_online_cpus(); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); - user_locked = atomic_long_read(&user->locked_vm); + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; - WARN_ON(!rb && event->rb); + if (!is_power_of_2(nr_pages)) + return -EINVAL; - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (!refcount_inc_not_zero(&rb->mmap_count)) + return -EINVAL; - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); + if (rb_has_aux(rb)) { + refcount_inc(&rb->aux_mmap_count); - if (!rb) { - ret = -ENOMEM; - goto unlock; + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + refcount_dec(&rb->mmap_count); + return -EPERM; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; + WARN_ON(!rb && event->rb); - ring_buffer_attach(event, rb); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (!ret) - rb->aux_mmap_locked = extra; + event->attr.aux_watermark, rb_flags); + if (ret) { + refcount_dec(&rb->mmap_count); + return ret; + } + + refcount_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; } - ret = 0; + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); -unlock: - if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); + return 0; +} + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long vma_size, nr_pages; + mapped_f mapped; + int ret; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same rb. 
+ */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + ret = security_perf_event_read(event); + if (ret) + return ret; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; - atomic_inc(&event->mmap_count); - } else if (rb) { - atomic_dec(&rb->mmap_count); + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; } -aux_unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); - mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our @@ -7052,11 +7175,19 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - if (!ret) - ret = map_range(rb, vma); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); - if (!ret && event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(event->rb, vma); + if (ret) + perf_mmap_close(vma); return ret; } @@ -7067,6 +7198,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -7107,18 +7241,18 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { /* - * We'd expect this to only occur if the irq_work is delayed and either - * ctx->task or current has changed in the meantime. This can be the - * case on architectures that do not implement arch_irq_work_raise(). + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ - if (WARN_ON_ONCE(event->ctx->task != current)) + if (current->flags & PF_EXITING) return; /* - * Both perf_pending_task() and perf_pending_irq() can race with the - * task exiting. + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). 
*/ - if (current->flags & PF_EXITING) + if (WARN_ON_ONCE(event->ctx->task != current)) return; send_sig_perf((void __user *)event->pending_addr, @@ -7154,15 +7288,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() @@ -7317,7 +7451,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -7360,6 +7494,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -7953,7 +8091,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable(); @@ -8061,21 +8199,35 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; + + if (!current->mm) + user = false; if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -9260,7 +9412,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_HUGETLB; if (file) { - struct inode *inode; + const struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -9273,12 +9425,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. 
*/ - name = file_path(file, buf, PATH_MAX - sizeof(u64)); + name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } - inode = file_inode(vma->vm_file); + inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; @@ -9349,7 +9501,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, if (!filter->path.dentry) return false; - if (d_inode(filter->path.dentry) != file_inode(file)) + if (d_inode(filter->path.dentry) != file_user_inode(file)) return false; if (filter->offset > offset + size) @@ -9860,6 +10012,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -9946,7 +10158,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes, void perf_event_itrace_started(struct perf_event *event) { - event->attach_state |= PERF_ATTACH_ITRACE; + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) @@ -10029,14 +10241,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + perf_event_throttle_group(event); + ret = 1; } if (event->attr.freq) { @@ -10201,6 +10412,7 @@ static int __perf_event_overflow(struct perf_event *event, ret = 1; event->pending_kill = POLL_HUP; 
perf_event_disable_inatomic(event); + event->pmu->stop(event, 0); } if (event->attr.sigtrap) { @@ -11013,7 +11225,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* @@ -11069,11 +11281,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -11098,6 +11314,10 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. */ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); @@ -11108,6 +11328,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { if (!event->prog) @@ -11130,7 +11364,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -11600,7 +11842,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -11645,11 +11888,21 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. + * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag. 
+ */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -11693,14 +11946,17 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11714,7 +11970,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -11771,14 +12027,17 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12054,7 +12313,7 @@ static const struct attribute_group *pmu_dev_groups[] = { }; static int pmu_bus_running; -static struct bus_type pmu_bus = { +static const struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; @@ -12235,6 +12494,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ @@ -12248,21 +12510,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, ctx->task, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. 
+ */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); + + event->pmu = NULL; /* force fault instead of UAF */ +} + +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; + } + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. 
+ */ perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12356,7 +12740,7 @@ static struct pmu *perf_init_event(struct perf_event *event) struct pmu *pmu; int type, ret; - guard(srcu)(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12580,6 +12964,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); @@ -12651,7 +13036,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; @@ -12758,6 +13143,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, /* symmetric to unaccount_event() in _free_event() */ account_event(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); + return_ptr(event); } @@ -12953,17 +13345,20 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } @@ -13138,6 +13533,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -13145,6 +13545,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13441,7 +13845,7 @@ err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13478,6 +13882,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13549,7 +13958,7 @@ err_unlock: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13664,14 +14073,13 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event) +static void sync_child_event(struct perf_event *child_event, + struct task_struct *task) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { - struct task_struct *task = child_event->ctx->task; - if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } @@ -13689,10 +14097,14 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, + struct task_struct *task, + bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + unsigned int attach_state; if (parent_event) { /* @@ -13707,22 +14119,41 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); + + if (attach_state & PERF_ATTACH_CHILD) + sync_child_event(event, task); } - perf_remove_from_context(event, detach_flags | DETACH_EXIT); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); + /* - * Kick perf_poll() for is_event_hup(); + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. */ - perf_event_wakeup(parent_event); - put_event(event); + if (attach_state & PERF_ATTACH_CHILD) { + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. + */ + put_event(event); + } + return; } @@ -13732,15 +14163,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -13753,27 +14182,28 @@ static void perf_event_exit_task_context(struct task_struct *child) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). 
*/ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -13783,28 +14213,48 @@ static void perf_event_exit_task_context(struct task_struct *child) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, exit ? task : NULL, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -13815,42 +14265,23 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - perf_event_exit_task_context(child); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. 
* At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); + perf_event_task(task, NULL, 0); /* * Detach the perf_ctx_data for the system-wide event. */ guard(percpu_read)(&global_ctx_data_rwsem); - detach_task_ctx_data(child); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - put_event(event); + detach_task_ctx_data(task); } /* @@ -13862,48 +14293,7 @@ static void perf_free_event(struct perf_event *event, */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = rcu_access_pointer(task->perf_event_ctxp); - if (!ctx) - return; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) @@ -13980,6 +14370,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -14484,6 +14882,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8d..d9cc57083091 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5130b119d0ae..20a905023736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } @@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order; + bool use_contiguous_pages = event->pmu->capabilities & ( + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); + /* + * Initialize max_order to 0 for page allocation. This allocates single + * pages to minimize memory fragmentation. This is overridden if the + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). + */ + int max_order = 0; + int ret = -ENOMEM; if (!has_aux(event)) return -EOPNOTSUPP; @@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!overwrite) { /* - * Watermark defaults to half the buffer, and so does the - * max_order, to aid PMU drivers in double buffering. + * Watermark defaults to half the buffer, to aid PMU drivers + * in double buffering. */ if (!watermark) watermark = min_t(unsigned long, @@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* - * Use aux_watermark as the basis for chunking to - * help PMU drivers honor the watermark. + * If using contiguous pages, use aux_watermark as the basis + * for chunking to help PMU drivers honor the watermark. 
*/ - max_order = get_order(watermark); + if (use_contiguous_pages) + max_order = get_order(watermark); } else { /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. + * If using contiguous pages, we need to start with the + * max_order that fits in nr_pages, not the other way around, + * hence ilog2() and not get_order. */ - max_order = ilog2(nr_pages); + if (use_contiguous_pages) + max_order = ilog2(nr_pages); watermark = 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d783b5882b6..d546d32390a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ +#include <linux/pagewalk.h> #include <linux/uprobes.h> @@ -78,7 +79,7 @@ struct uprobe { * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * - * insn - copy_insn() saves the original instruction here for + * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of @@ -106,8 +107,8 @@ static LIST_HEAD(delayed_uprobe_list); * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - unsigned long *bitmap; /* 0 = free slot */ + wait_queue_head_t wq; /* if all slots are busy */ + unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* @@ -115,12 +116,12 @@ struct xol_area { * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* @@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) } /** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - pte_t pte; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - pte = ptep_get(pvmw.pte); - - /* - * Handle PFN swap PTES, such as device-exclusive ones, that actually - * map pages: simply trigger GUP again to fix it up. 
- */ - if (unlikely(!pte_present(pte))) { - page_vma_mapped_walk_done(&pvmw); - goto unlock; - } - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(pte)); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - -/** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn @@ -261,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -275,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -289,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -463,6 +380,94 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. 
+ */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -474,147 +479,156 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. 
*/ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) +{ + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) { + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - struct page *old_page, *new_page; - struct vm_area_struct *vma; - int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; - is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); -retry: + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; - - if (is_zero_page(old_page)) { - ret = -EINVAL; - goto put_old; - } + goto out; + folio = page_folio(page); - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = verify(page, insn_vaddr, insn, nbytes, data); + if (ret <= 0) { + folio_put(folio); + goto out; } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 
1 : -1); - if (ret) - goto put_old; + if (ret) { + folio_put(folio); + goto out; + } ref_ctr_updated = 1; } ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); + folio_walk_end(&fw, vma); + } + + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } +out: /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. 
* Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + return uprobe_write_opcode(auprobe, vma, vaddr, + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ @@ -1047,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1134,10 +1148,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1149,24 +1163,26 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ - first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) - set_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_set(MMF_HAS_UPROBES, mm); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) - clear_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + struct mm_struct *mm = vma->vm_mm; + + mm_flags_set(MMF_RECALC_UPROBES, mm); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { @@ -1204,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } @@ -1296,10 +1312,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + err = install_breakpoint(uprobe, vma, info->vaddr); + } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1391,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. 
*/ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1457,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1472,9 +1488,9 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1589,7 +1605,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && - test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) @@ -1610,7 +1626,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } @@ -1649,12 +1665,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || - test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || + mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) - set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, @@ -1720,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1752,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1786,6 +1802,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1797,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -1817,10 +1843,10 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { - set_bit(MMF_HAS_UPROBES, &newmm->flags); + if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { + mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ - set_bit(MMF_RECALC_UPROBES, &newmm->flags); + mm_flags_set(MMF_RECALC_UPROBES, newmm); } } @@ -2154,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
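Several hunks above convert open-coded test_bit()/set_bit()/clear_bit() on mm->flags into mm_flags_test()/mm_flags_set()/mm_flags_clear() helpers. A standalone toy of that accessor pattern follows; the struct name, the field, and the non-atomic bit operations are assumptions made purely for illustration (the real helpers wrap atomic bitops on mm_struct's flag storage):

#include <stdbool.h>
#include <stdio.h>

struct mm_like {
	unsigned long flags;		/* stand-in for the mm flag bitmap */
};

static inline bool mm_flags_test(int flag, const struct mm_like *mm)
{
	return (mm->flags >> flag) & 1UL;
}

static inline void mm_flags_set(int flag, struct mm_like *mm)
{
	mm->flags |= 1UL << flag;
}

static inline void mm_flags_clear(int flag, struct mm_like *mm)
{
	mm->flags &= ~(1UL << flag);
}

int main(void)
{
	enum { DEMO_MMF_HAS_UPROBES = 19 };	/* arbitrary bit number for the demo */
	struct mm_like mm = { 0 };

	mm_flags_set(DEMO_MMF_HAS_UPROBES, &mm);
	printf("after set:   %d\n", mm_flags_test(DEMO_MMF_HAS_UPROBES, &mm));
	mm_flags_clear(DEMO_MMF_HAS_UPROBES, &mm);
	printf("after clear: %d\n", mm_flags_test(DEMO_MMF_HAS_UPROBES, &mm));
	return 0;
}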
*/ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; @@ -2364,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) return; } - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) @@ -2387,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2462,7 +2488,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb *is_swbp = -EFAULT; } - if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); @@ -2671,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2735,6 +2765,16 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + + /* + * If user decided to take execution elsewhere, it makes little sense + * to execute the original instruction, so let's skip it. + */ + if (instruction_pointer(regs) != bp_vaddr) + goto out; + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2746,6 +2786,23 @@ out: rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. @@ -2812,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) if (!current->mm) return 0; - if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 1b51dc099f1e..8a87021211ae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,6 +68,7 @@ #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> +#include <linux/unwind_deferred.h> #include <linux/uaccess.h> #include <linux/pidfs.h> @@ -133,8 +134,13 @@ struct release_task_post { static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); @@ -245,15 +251,14 @@ repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. 
But shut RCU-lockdep up */ - rcu_read_lock(); + * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); - thread_pid = get_pid(p->thread_pid); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -282,8 +287,9 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); - put_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -415,44 +421,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_task_exit(struct task_struct *tsk) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - struct core_state *core_state; + struct core_thread self; + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Serialize with any possible pending coredump. - * We must hold siglock around checking core_state - * and setting PF_POSTCOREDUMP. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group without PF_POSTCOREDUMP set. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - spin_lock_irq(&tsk->sighand->siglock); - tsk->flags |= PF_POSTCOREDUMP; - core_state = tsk->signal->core_state; - spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { - struct core_thread self; - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); - for (;;) { - set_current_state(TASK_IDLE|TASK_FREEZABLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_current_state(TASK_RUNNING); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); } + __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG @@ -700,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) @@ -792,24 +779,29 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +#ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP n--; -# else - n++; -# endif } while (!*n); -# ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else +} +#else /* !CONFIG_STACK_GROWSUP */ +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif } +#endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) @@ -868,14 +860,15 @@ static void check_stack_usage(void) } spin_unlock(&low_water_lock); } -#else +#else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} -#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; + struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; @@ -885,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) signal->group_exit_code = code; signal->group_stop_count = 0; } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) @@ -894,19 +899,17 @@ void __noreturn do_exit(long code) int group_dead; WARN_ON(irqs_disabled()); - - synchronize_group_exit(tsk, code); - WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); - coredump_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); @@ -936,12 +939,27 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); + + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. 
+ */ + unwind_deferred_task_exit(tsk); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); @@ -949,20 +967,12 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b..b1f3915d5f8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,7 +93,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> -#include <linux/stackleak.h> +#include <linux/kstack_erase.h> #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> @@ -105,13 +105,17 @@ #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> - -#include <asm/pgalloc.h> +#include <linux/unwind_deferred.h> +#include <linux/pgalloc.h> #include <linux/uaccess.h> + #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include <trace/events/sched.h> #define CREATE_TRACE_POINTS @@ -185,34 +189,81 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. + */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) { + struct vm_struct *vm_area; unsigned int i; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *tmp = NULL; + /* + * If the node has memory, we are guaranteed the stacks are backed by local pages. + * Otherwise the pages are arbitrary. + * + * Note that depending on cpuset it is possible we will get migrated to a different + * node immediately after allocating here, so this does *not* guarantee locality for + * arbitrary callers. 
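The stack-cache helpers added in this hunk lean on scoped_guard(preempt) so the preempt-disabled window cannot leak on an early return. The mechanism behind guard()/scoped_guard() is the compiler-run cleanup attribute; here is a userspace sketch of that idea with made-up lock/unlock names, not the kernel's cleanup.h implementation:

#include <stdio.h>

static int lock_depth;

static void demo_lock(void)   { lock_depth++; printf("lock   (depth %d)\n", lock_depth); }
static void demo_unlock(void) { printf("unlock (depth %d)\n", lock_depth); lock_depth--; }

static void demo_unlock_cleanup(int *unused) { (void)unused; demo_unlock(); }

/* Acquire now, release automatically when the enclosing scope ends. */
#define DEMO_GUARD() \
	__attribute__((cleanup(demo_unlock_cleanup))) int _demo_guard = (demo_lock(), 0)

int main(void)
{
	{
		DEMO_GUARD();
		printf("critical section\n");
		/* any exit path out of this scope still runs demo_unlock() */
	}
	printf("after the guarded scope\n");
	return 0;
}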
+ */ + scoped_guard(preempt) { + if (node != NUMA_NO_NODE && numa_node_id() != node) + return NULL; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (vm_area) + return vm_area; + } + } + + return NULL; +} - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) - return true; +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) +{ + unsigned int i; + int nid; + + /* + * Don't cache stacks if any of the pages don't match the local domain, unless + * there is no local memory to begin with. + * + * Note that lack of local memory does not automatically mean it makes no difference + * performance-wise which other domain backs the stack. In this case we are merely + * trying to avoid constantly going to vmalloc. + */ + scoped_guard(preempt) { + nid = numa_node_id(); + if (node_state(nid, N_MEMORY)) { + for (i = 0; i < vm_area->nr_pages; i++) { + struct page *page = vm_area->pages[i]; + if (page_to_nid(page) != nid) + return false; + } + } + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *tmp = NULL; + + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + return true; + } } return false; } @@ -220,11 +271,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -237,32 +289,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -270,55 +322,43 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; - int i; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) - continue; + vm_area = alloc_thread_stack_node_from_cache(tsk, node); + if (vm_area) { + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); + return -ENOMEM; + } /* Reset stack metadata. 
*/ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); - return -ENOMEM; - } - - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. - */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -327,7 +367,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -342,7 +382,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -374,8 +420,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -414,7 +459,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -428,100 +474,17 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - - return vma; -} - -static void vm_area_init_from(const struct vm_area_struct *src, - struct vm_area_struct *dest) -{ - dest->vm_mm = src->vm_mm; - dest->vm_ops = src->vm_ops; - dest->vm_start = src->vm_start; - dest->vm_end = src->vm_end; - dest->anon_vma = src->anon_vma; - dest->vm_pgoff = src->vm_pgoff; - dest->vm_file = src->vm_file; - dest->vm_private_data = src->vm_private_data; - vm_flags_init(dest, src->vm_flags); - memcpy(&dest->vm_page_prot, &src->vm_page_prot, - sizeof(dest->vm_page_prot)); - /* - * src->shared.rb may be modified concurrently when called from - * dup_mmap(), but the clone will reinitialize it. 
- */ - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, - sizeof(dest->vm_userfaultfd_ctx)); -#ifdef CONFIG_ANON_VMA_NAME - dest->anon_name = src->anon_name; -#endif -#ifdef CONFIG_SWAP - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, - sizeof(dest->swap_readahead_info)); -#endif -#ifndef CONFIG_MMU - dest->vm_region = src->vm_region; -#endif -#ifdef CONFIG_NUMA - dest->vm_policy = src->vm_policy; -#endif -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - vm_area_init_from(orig, new); - vma_lock_init(new, true); - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(new->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(new); - - return new; -} - -void vm_area_free(struct vm_area_struct *vma) -{ - /* The vma should be detached while being destroyed. */ - vma_assert_detached(vma); - vma_numab_state_free(vma); - free_anon_vma_name(vma); - kmem_cache_free(vm_area_cachep, vma); -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -537,12 +500,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } @@ -593,7 +556,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -608,178 +571,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - if (mmap_write_lock_killable(oldmm)) - return -EINTR; - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - ksm_fork(mm, oldmm); - khugepaged_fork(mm, oldmm); - } else { - - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); - } - /* - * The mm_struct is going to exit, but the locks will be dropped - * first. Set the mm_struct as unstable is advisable as it is - * not fully initialised. 
- */ - set_bit(MMF_UNSTABLE, &mm->flags); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - if (!retval) - dup_userfaultfd_complete(&uf); - else - dup_userfaultfd_fail(&uf); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -793,13 +584,6 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -844,9 +628,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) @@ -991,9 +778,9 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); - sched_ext_free(tsk); + unwind_task_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); @@ -1210,10 +997,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = -1; - tsk->last_mm_cid = -1; - tsk->mm_cid_active = 0; - tsk->migrate_from_cpu = -1; + tsk->mm_cid.cid = MM_CID_UNSET; + tsk->mm_cid.active = 0; #endif return tsk; @@ -1269,6 +1054,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1311,14 +1097,20 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_overwrite_word(mm, default_dump_filter); mm->def_flags = 0; } + if (futex_mm_init(mm)) + goto fail_mm_init; + if (mm_alloc_pgd(mm)) goto fail_nopgd; @@ -1348,6 +1140,8 @@ fail_nocontext: fail_noid: mm_free_pgd(mm); fail_nopgd: + futex_hash_free(mm); +fail_mm_init: free_mm(mm); return NULL; } @@ -1387,6 +1181,7 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); + futex_hash_free(mm); mmdrop(mm); } @@ -1402,7 +1197,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, @@ -1756,7 +1551,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct 
mm_struct *mm, *oldmm; @@ -1794,19 +1589,19 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); @@ -1815,7 +1610,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1845,7 +1640,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1894,7 +1689,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; @@ -1937,6 +1732,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -2036,17 +1835,16 @@ static inline void rcu_copy_process(struct task_struct *p) } /** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2063,59 +1861,50 @@ static inline void rcu_copy_process(struct task_struct *p) * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { - struct file *pidfd_file; + struct file *pidfs_file; + + /* + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. 
+ */ + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. + */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + /* Task has already been reaped. */ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; + /* + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. + */ + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; + } CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) - return PTR_ERR(pidfd_file); + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); - *ret = pidfd_file; + *ret_file = pidfs_file; return take_fd(pidfd); } -/** - * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper verifies that @pid is still in use, without PIDFD_THREAD the - * task identified by @pid must be a thread-group leader. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. - */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? 
PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; - - return __pidfd_prepare(pid, flags, ret); -} - static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2143,7 +1932,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); - set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -2153,15 +1942,19 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { - int i; - - for (i = 0; i < RV_PER_TASK_MONITORS; i++) - p->rv[i].da_mon.monitoring = false; + memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) #endif +static bool need_futex_hash_allocate_default(u64 clone_flags) +{ + if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) + return false; + return true; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2379,13 +2172,10 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif -#ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ -#endif + #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -2395,6 +2185,8 @@ __latent_entropy struct task_struct *copy_process( p->bpf_ctx = NULL; #endif + unwind_task_init(p); + /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); if (retval) @@ -2462,7 +2254,7 @@ __latent_entropy struct task_struct *copy_process( * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2543,6 +2335,21 @@ __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; /* + * Allocate a default futex hash for the user process once the first + * thread spawns. + */ + if (need_futex_hash_allocate_default(clone_flags)) { + retval = futex_hash_allocate_default(); + if (retval) + goto bad_fork_cancel_cgroup; + /* + * If we fail beyond this point we don't free the allocated + * futex hash map. We assume that another thread will be created + * and makes use of it. The hash map will be freed once the main + * thread terminates. + */ + } + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
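The pidfd_prepare() consolidation above (including the PIDFD_STALE call from copy_process()) is the kernel side of CLONE_PIDFD, and the clone3 ABI already passes a 64-bit flags word, which the u64 clone_flags plumbing throughout this file mirrors. A minimal userspace illustration of that established interface; it assumes kernel headers and a libc recent enough to expose struct clone_args and SYS_clone3:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <sys/syscall.h>
#include <sys/wait.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;		/* 64-bit flags word */
	args.pidfd = (uintptr_t)&pidfd;		/* kernel writes the new fd here */
	args.exit_signal = SIGCHLD;

	long pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)
		_exit(0);			/* child exits immediately */

	/* Parent: the pidfd becomes readable once the child has exited. */
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	poll(&pfd, 1, -1);
	printf("child %ld exited, observed via pidfd %d\n", pid, pidfd);
	waitpid(pid, NULL, 0);
	return 0;
}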
In particular, we do * not want user-space to be able to predict the process start-time by @@ -2686,9 +2493,10 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } @@ -2720,6 +2528,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); @@ -2778,11 +2587,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2894,9 +2701,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2912,9 +2718,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; @@ -2988,7 +2793,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif -noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, +static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { @@ -3213,11 +3018,6 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { - struct kmem_cache_args args = { - .use_freeptr_offset = true, - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), - }; - sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3234,10 +3034,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), &args, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } @@ -3403,13 +3199,13 @@ int ksys_unshare(unsigned long unshare_flags) if (new_fs) { fs = current->fs; - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } if (new_fd) @@ -3470,7 +3266,7 @@ int unshare_files(void) return 0; } -int sysctl_max_threads(const struct ctl_table *table, int write, +static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ 
-3492,3 +3288,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write, return 0; } + +static const struct ctl_table fork_sysctl_table[] = { + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, +}; + +static int __init init_fork_sysctl(void) +{ + register_sysctl_init("kernel", fork_sysctl_table); + return 0; +} + +subsys_initcall(init_fork_sysctl); diff --git a/kernel/freezer.c b/kernel/freezer.c index 8d530d0949ff..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -10,6 +10,7 @@ #include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/oom.h> #include <linux/kthread.h> /* total number of freezing conditions in effect */ @@ -40,10 +41,10 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_tsk_thread_flag(p, TIF_MEMDIE)) + if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) @@ -201,18 +202,26 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) void __thaw_task(struct task_struct *p) { - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (WARN_ON_ONCE(freezing(p))) - goto unlock; + guard(spinlock_irqsave)(&freezer_lock); + if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL)) + wake_up_state(p, TASK_FROZEN); +} - if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) - goto unlock; +/* + * thaw_process - Thaw a frozen process + * @p: the process to be thawed + * + * Iterate over all threads of @p and call __thaw_task() on each. + */ +void thaw_process(struct task_struct *p) +{ + struct task_struct *t; - wake_up_state(p, TASK_FROZEN); -unlock: - spin_unlock_irqrestore(&freezer_lock, flags); + rcu_read_lock(); + for_each_thread(p, t) { + __thaw_task(t); + } + rcu_read_unlock(); } /** diff --git a/kernel/futex/core.c b/kernel/futex/core.c index cca15859a50b..cf7e610eac42 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -36,9 +36,14 @@ #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/prctl.h> +#include <linux/mempolicy.h> +#include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" @@ -49,12 +54,23 @@ * reside in the same cacheline. */ static struct { - struct futex_hash_bucket *queues; unsigned long hashmask; + unsigned int hashshift; + struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashmask (__futex_data.hashmask) +#define futex_hashmask (__futex_data.hashmask) +#define futex_hashshift (__futex_data.hashshift) +#define futex_queues (__futex_data.queues) + +struct futex_private_hash { + int state; + unsigned int hash_mask; + struct rcu_head rcu; + void *mm; + bool custom; + struct futex_hash_bucket queues[]; +}; /* * Fault injections for futexes. 
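The futex_private_hash structure introduced above gives a process its own bucket array (the queues[] flexible array, with hash_mask equal to the slot count minus one) alongside the now per-node global tables. A toy userspace analogue of that layout and of picking a bucket from a futex address; the multiplicative hash is a stand-in for the kernel's jhash2() and every name here is illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_bucket {
	unsigned long waiters;
};

struct toy_private_hash {
	unsigned int hash_mask;		/* nbuckets - 1, nbuckets a power of two */
	struct toy_bucket queues[];	/* flexible array, like fph->queues */
};

static struct toy_private_hash *toy_hash_alloc(unsigned int nbuckets)
{
	struct toy_private_hash *ph;

	ph = calloc(1, sizeof(*ph) + nbuckets * sizeof(ph->queues[0]));
	if (ph)
		ph->hash_mask = nbuckets - 1;
	return ph;
}

static struct toy_bucket *toy_hash_bucket(struct toy_private_hash *ph, uintptr_t uaddr)
{
	uint64_t h = uaddr * 0x9E3779B97F4A7C15ULL;	/* jhash2() stand-in */

	return &ph->queues[(h >> 32) & ph->hash_mask];
}

int main(void)
{
	struct toy_private_hash *ph = toy_hash_alloc(16);
	struct toy_bucket *hb;
	int futex_word;

	if (!ph)
		return 1;
	hb = toy_hash_bucket(ph, (uintptr_t)&futex_word);
	hb->waiters++;
	printf("bucket %td holds %lu waiter(s)\n", hb - ph->queues, hb->waiters);
	free(ph);
	return 0;
}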
@@ -107,21 +123,331 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph); + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +static bool futex_ref_get(struct futex_private_hash *fph); +static bool futex_ref_put(struct futex_private_hash *fph); +static bool futex_ref_is_dead(struct futex_private_hash *fph); + +enum { FR_PERCPU = 0, FR_ATOMIC }; + +static inline bool futex_key_is_private(union futex_key *key) +{ + /* + * Relies on get_futex_key() to set either bit for shared + * futexes -- see comment with union futex_key. + */ + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); +} + +static bool futex_private_hash_get(struct futex_private_hash *fph) +{ + return futex_ref_get(fph); +} + +void futex_private_hash_put(struct futex_private_hash *fph) +{ + if (futex_ref_put(fph)) + wake_up_var(fph->mm); +} + /** - * futex_hash - Return the hash bucket in the global hash - * @key: Pointer to the futex key for which the hash is calculated + * futex_hash_get - Get an additional reference for the local hash. + * @hb: ptr to the private local hash. * - * We hash on the keys returned from get_futex_key (see below) and return the - * corresponding hash bucket in the global hash. + * Obtain an additional reference for the already obtained hash bucket. The + * caller must already own an reference. */ +void futex_hash_get(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + WARN_ON_ONCE(!futex_private_hash_get(fph)); +} + +void futex_hash_put(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + futex_private_hash_put(fph); +} + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + u32 hash; + + if (!futex_key_is_private(key)) + return NULL; + + if (!fph) + fph = rcu_dereference(key->private.mm->futex_phash); + if (!fph || !fph->hash_mask) + return NULL; + + hash = jhash2((void *)&key->private.address, + sizeof(key->private.address) / 4, + key->both.offset); + return &fph->queues[hash & fph->hash_mask]; +} + +static void futex_rehash_private(struct futex_private_hash *old, + struct futex_private_hash *new) +{ + struct futex_hash_bucket *hb_old, *hb_new; + unsigned int slots = old->hash_mask + 1; + unsigned int i; + + for (i = 0; i < slots; i++) { + struct futex_q *this, *tmp; + + hb_old = &old->queues[i]; + + spin_lock(&hb_old->lock); + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { + + plist_del(&this->list, &hb_old->chain); + futex_hb_waiters_dec(hb_old); + + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); + + hb_new = __futex_hash(&this->key, new); + futex_hb_waiters_inc(hb_new); + /* + * The new pointer isn't published yet but an already + * moved user can be unqueued due to timeout or signal. 
+ */ + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); + plist_add(&this->list, &hb_new->chain); + this->lock_ptr = &hb_new->lock; + spin_unlock(&hb_new->lock); + } + spin_unlock(&hb_old->lock); + } +} + +static bool __futex_pivot_hash(struct mm_struct *mm, + struct futex_private_hash *new) +{ + struct futex_private_hash *fph; + + WARN_ON_ONCE(mm->futex_phash_new); + + fph = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + if (fph) { + if (!futex_ref_is_dead(fph)) { + mm->futex_phash_new = new; + return false; + } + + futex_rehash_private(fph, new); + } + new->state = FR_PERCPU; + scoped_guard(rcu) { + mm->futex_batches = get_state_synchronize_rcu(); + rcu_assign_pointer(mm->futex_phash, new); + } + kvfree_rcu(fph, rcu); + return true; +} + +static void futex_pivot_hash(struct mm_struct *mm) +{ + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *fph; + + fph = mm->futex_phash_new; + if (fph) { + mm->futex_phash_new = NULL; + __futex_pivot_hash(mm, fph); + } + } +} + +struct futex_private_hash *futex_private_hash(void) +{ + struct mm_struct *mm = current->mm; + /* + * Ideally we don't loop. If there is a replacement in progress + * then a new private hash is already prepared and a reference can't be + * obtained once the last user dropped it's. + * In that case we block on mm_struct::futex_hash_lock and either have + * to perform the replacement or wait while someone else is doing the + * job. Eitherway, on the second iteration we acquire a reference on the + * new private hash or loop again because a new replacement has been + * requested. + */ +again: + scoped_guard(rcu) { + struct futex_private_hash *fph; + + fph = rcu_dereference(mm->futex_phash); + if (!fph) + return NULL; + + if (futex_private_hash_get(fph)) + return fph; + } + futex_pivot_hash(mm); + goto again; +} + +struct futex_hash_bucket *futex_hash(union futex_key *key) +{ + struct futex_private_hash *fph; + struct futex_hash_bucket *hb; + +again: + scoped_guard(rcu) { + hb = __futex_hash(key, NULL); + fph = hb->priv; + + if (!fph || futex_private_hash_get(fph)) + return hb; + } + futex_pivot_hash(key->private.mm); + goto again; +} + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + return NULL; +} + struct futex_hash_bucket *futex_hash(union futex_key *key) { - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, - key->both.offset); + return __futex_hash(key, NULL); +} + +#endif /* CONFIG_FUTEX_PRIVATE_HASH */ + +#ifdef CONFIG_FUTEX_MPOL + +static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = vma_lookup(mm, addr); + struct mempolicy *mpol; + int node = FUTEX_NO_NODE; + + if (!vma) + return FUTEX_NO_NODE; + + mpol = vma_policy(vma); + if (!mpol) + return FUTEX_NO_NODE; + + switch (mpol->mode) { + case MPOL_PREFERRED: + node = first_node(mpol->nodes); + break; + case MPOL_PREFERRED_MANY: + case MPOL_BIND: + if (mpol->home_node != NUMA_NO_NODE) + node = mpol->home_node; + break; + default: + break; + } + + return node; +} + +static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) +{ + int seq, node; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return -EBUSY; - return &futex_queues[hash & futex_hashmask]; + node = __futex_key_to_node(mm, addr); + + if (mmap_lock_speculate_retry(mm, seq)) + return -EAGAIN; + + return node; } +static int 
futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + int node; + + node = futex_key_to_node_opt(mm, addr); + if (node >= FUTEX_NO_NODE) + return node; + + guard(mmap_read_lock)(mm); + return __futex_key_to_node(mm, addr); +} + +#else /* !CONFIG_FUTEX_MPOL */ + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + return FUTEX_NO_NODE; +} + +#endif /* CONFIG_FUTEX_MPOL */ + +/** + * __futex_hash - Return the hash bucket + * @key: Pointer to the futex key for which the hash is calculated + * @fph: Pointer to private hash if known + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket. + * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the + * private hash) is returned if existing. Otherwise a hash bucket from the + * global hash is returned. + */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph) +{ + int node = key->both.node; + u32 hash; + + if (node == FUTEX_NO_NODE) { + struct futex_hash_bucket *hb; + + hb = __futex_hash_private(key, fph); + if (hb) + return hb; + } + + hash = jhash2((u32 *)key, + offsetof(typeof(*key), both.offset) / sizeof(u32), + key->both.offset); + + if (node == FUTEX_NO_NODE) { + /* + * In case of !FLAGS_NUMA, use some unused hash bits to pick a + * node -- this ensures regular futexes are interleaved across + * the nodes and avoids having to allocate multiple + * hash-tables. + * + * NOTE: this isn't perfectly uniform, but it is fast and + * handles sparse node masks. + */ + node = (hash >> futex_hashshift) % nr_node_ids; + if (!node_possible(node)) { + node = find_next_bit_wrap(node_possible_map.bits, + nr_node_ids, node); + } + } + + return &futex_queues[node][hash & futex_hashmask]; +} /** * futex_setup_timer - set up the sleeping hrtimer. @@ -206,7 +532,7 @@ static u64 get_inode_sequence_number(struct inode *inode) * * For shared mappings (when @fshared), the key is: * - * ( inode->i_sequence, page->index, offset_within_page ) + * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * @@ -227,25 +553,60 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, struct page *page; struct folio *folio; struct address_space *mapping; - int err, ro = 0; + int node, err, size, ro = 0; + bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; + size = futex_size(flags); + if (flags & FLAGS_NUMA) + size *= 2; /* * The futex address must be "naturally" aligned. 
*/ key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) + if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; + node = FUTEX_NO_NODE; + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (get_user_inline(node, naddr)) + return -EFAULT; + + if ((node != FUTEX_NO_NODE) && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) + return -EINVAL; + } + + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { + node = futex_mpol(mm, address); + node_updated = true; + } + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (node == FUTEX_NO_NODE) { + node = numa_node_id(); + node_updated = true; + } + if (node_updated && put_user_inline(node, naddr)) + return -EFAULT; + } + + key->both.node = node; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -502,13 +863,9 @@ void __futex_unqueue(struct futex_q *q) } /* The key must be already stored in q->key. */ -struct futex_hash_bucket *futex_q_lock(struct futex_q *q) +void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) { - struct futex_hash_bucket *hb; - - hb = futex_hash(&q->key); - /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is @@ -522,14 +879,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) q->lock_ptr = &hb->lock; spin_lock(&hb->lock); - return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { - spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); + spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, @@ -568,6 +924,8 @@ int futex_unqueue(struct futex_q *q) spinlock_t *lock_ptr; int ret = 0; + /* RCU so lock_ptr is not going away during locking. */ + guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* @@ -606,6 +964,24 @@ retry: return ret; } +void futex_q_lockptr_lock(struct futex_q *q) +{ + spinlock_t *lock_ptr; + + /* + * See futex_unqueue() why lock_ptr can change. + */ + guard(rcu)(); +retry: + lock_ptr = READ_ONCE(q->lock_ptr); + spin_lock(lock_ptr); + + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } +} + /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. @@ -949,10 +1325,20 @@ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* + * The mutex mm_struct::futex_hash_lock might be acquired. + */ + might_sleep(); + /* + * Ensure the hash remains stable (no resize) during the while loop + * below. The hb pointer is acquired under the pi_lock so we can't block + * on the mutex. 
+ */ + WARN_ON(curr != current); + guard(private_hash)(); + /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: @@ -962,50 +1348,52 @@ static void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; - hb = futex_hash(&key); - - /* - * We can race against put_pi_state() removing itself from the - * list (a waiter going away). put_pi_state() will first - * decrement the reference count and then modify the list, so - * its possible to see the list entry but fail this reference - * acquire. - * - * In that case; drop the locks to let put_pi_state() make - * progress and retry the loop. - */ - if (!refcount_inc_not_zero(&pi_state->refcount)) { + if (1) { + CLASS(hb, hb)(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. + */ + if (!refcount_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } raw_spin_unlock_irq(&curr->pi_lock); - cpu_relax(); - raw_spin_lock_irq(&curr->pi_lock); - continue; - } - raw_spin_unlock_irq(&curr->pi_lock); - spin_lock(&hb->lock); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - raw_spin_lock(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - /* retain curr->pi_lock for the loop invariant */ - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_lock(&hb->lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + put_pi_state(pi_state); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + + raw_spin_unlock(&curr->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); - put_pi_state(pi_state); - continue; } - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - - raw_spin_unlock(&curr->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); @@ -1125,30 +1513,500 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } +static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, + struct futex_private_hash *fph) +{ +#ifdef CONFIG_FUTEX_PRIVATE_HASH + fhb->priv = fph; +#endif + atomic_set(&fhb->waiters, 0); + plist_head_init(&fhb->chain); + spin_lock_init(&fhb->lock); +} + +#define FH_CUSTOM 0x01 + +#ifdef CONFIG_FUTEX_PRIVATE_HASH + +/* + * futex-ref + * + * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that + * code because it just doesn't fit right. 
+ * + * Dual counter, per-cpu / atomic approach like percpu-refcount, except it + * re-initializes the state automatically, such that the fph swizzle is also a + * transition back to per-cpu. + */ + +static void futex_ref_rcu(struct rcu_head *head); + +static void __futex_ref_atomic_begin(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * The counter we're about to switch to must have fully switched; + * otherwise it would be impossible for it to have reported success + * from futex_ref_is_dead(). + */ + WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); + + /* + * Set the atomic to the bias value such that futex_ref_{get,put}() + * will never observe 0. Will be fixed up in __futex_ref_atomic_end() + * when folding in the percpu count. + */ + atomic_long_set(&mm->futex_atomic, LONG_MAX); + smp_store_release(&fph->state, FR_ATOMIC); + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static void __futex_ref_atomic_end(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + unsigned int count = 0; + long ret; + int cpu; + + /* + * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC + * and per this RCU callback, everybody must now observe this state and + * use the atomic variable. + */ + WARN_ON_ONCE(fph->state != FR_ATOMIC); + + /* + * Therefore the per-cpu counter is now stable, sum and reset. + */ + for_each_possible_cpu(cpu) { + unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); + count += *ptr; + *ptr = 0; + } + + /* + * Re-init for the next cycle. + */ + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + + /* + * Add actual count, subtract bias and initial refcount. + * + * The moment this atomic operation happens, futex_ref_is_dead() can + * become true. + */ + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); + if (!ret) + wake_up_var(mm); + + WARN_ON_ONCE(ret < 0); + mmput_async(mm); +} + +static void futex_ref_rcu(struct rcu_head *head) +{ + struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); + + if (fph->state == FR_PERCPU) { + /* + * Per this extra grace-period, everybody must now observe + * fph as the current fph and no previously observed fph's + * are in-flight. + * + * Notably, nobody will now rely on the atomic + * futex_ref_is_dead() state anymore so we can begin the + * migration of the per-cpu counter into the atomic. + */ + __futex_ref_atomic_begin(fph); + return; + } + + __futex_ref_atomic_end(fph); +} + +/* + * Drop the initial refcount and transition to atomics. + */ +static void futex_ref_drop(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * Can only transition the current fph; + */ + WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); + /* + * We enqueue at least one RCU callback. Ensure mm stays if the task + * exits before the transition is completed. + */ + mmget(mm); + + /* + * In order to avoid the following scenario: + * + * futex_hash() __futex_pivot_hash() + * guard(rcu); guard(mm->futex_hash_lock); + * fph = mm->futex_phash; + * rcu_assign_pointer(&mm->futex_phash, new); + * futex_hash_allocate() + * futex_ref_drop() + * fph->state = FR_ATOMIC; + * atomic_set(, BIAS); + * + * futex_private_hash_get(fph); // OOPS + * + * Where an old fph (which is FR_ATOMIC) and should fail on + * inc_not_zero, will succeed because a new transition is started and + * the atomic is bias'ed away from 0. 
+ * + * There must be at least one full grace-period between publishing a + * new fph and trying to replace it. + */ + if (poll_state_synchronize_rcu(mm->futex_batches)) { + /* + * There was a grace-period, we can begin now. + */ + __futex_ref_atomic_begin(fph); + return; + } + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static bool futex_ref_get(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(preempt)(); + + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_inc(*mm->futex_ref); + return true; + } + + return atomic_long_inc_not_zero(&mm->futex_atomic); +} + +static bool futex_ref_put(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(preempt)(); + + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_dec(*mm->futex_ref); + return false; + } + + return atomic_long_dec_and_test(&mm->futex_atomic); +} + +static bool futex_ref_is_dead(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) + return false; + + return atomic_long_read(&mm->futex_atomic) == 0; +} + +int futex_mm_init(struct mm_struct *mm) +{ + mutex_init(&mm->futex_hash_lock); + RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; + /* futex-ref */ + mm->futex_ref = NULL; + atomic_long_set(&mm->futex_atomic, 0); + mm->futex_batches = get_state_synchronize_rcu(); + return 0; +} + +void futex_hash_free(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + free_percpu(mm->futex_ref); + kvfree(mm->futex_phash_new); + fph = rcu_dereference_raw(mm->futex_phash); + if (fph) + kvfree(fph); +} + +static bool futex_pivot_pending(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + + if (!mm->futex_phash_new) + return true; + + fph = rcu_dereference(mm->futex_phash); + return futex_ref_is_dead(fph); +} + +static bool futex_hash_less(struct futex_private_hash *a, + struct futex_private_hash *b) +{ + /* user provided always wins */ + if (!a->custom && b->custom) + return true; + if (a->custom && !b->custom) + return false; + + /* zero-sized hash wins */ + if (!b->hash_mask) + return true; + if (!a->hash_mask) + return false; + + /* keep the biggest */ + if (a->hash_mask < b->hash_mask) + return true; + if (a->hash_mask > b->hash_mask) + return false; + + return false; /* equal */ +} + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + struct mm_struct *mm = current->mm; + struct futex_private_hash *fph; + bool custom = flags & FH_CUSTOM; + int i; + + if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) + return -EINVAL; + + /* + * Once we've disabled the global hash there is no way back. + */ + scoped_guard(rcu) { + fph = rcu_dereference(mm->futex_phash); + if (fph && !fph->hash_mask) { + if (custom) + return -EBUSY; + return 0; + } + } + + if (!mm->futex_ref) { + /* + * This will always be allocated by the first thread and + * therefore requires no locking. + */ + mm->futex_ref = alloc_percpu(unsigned int); + if (!mm->futex_ref) + return -ENOMEM; + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + } + + fph = kvzalloc(struct_size(fph, queues, hash_slots), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!fph) + return -ENOMEM; + + fph->hash_mask = hash_slots ? hash_slots - 1 : 0; + fph->custom = custom; + fph->mm = mm; + + for (i = 0; i < hash_slots; i++) + futex_hash_bucket_init(&fph->queues[i], fph); + + if (custom) { + /* + * Only let prctl() wait / retry; don't unduly delay clone(). 
+ */ +again: + wait_var_event(mm, futex_pivot_pending(mm)); + } + + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *free __free(kvfree) = NULL; + struct futex_private_hash *cur, *new; + + cur = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + new = mm->futex_phash_new; + mm->futex_phash_new = NULL; + + if (fph) { + if (cur && !cur->hash_mask) { + /* + * If two threads simultaneously request the global + * hash then the first one performs the switch, + * the second one returns here. + */ + free = fph; + mm->futex_phash_new = new; + return -EBUSY; + } + if (cur && !new) { + /* + * If we have an existing hash, but do not yet have + * allocated a replacement hash, drop the initial + * reference on the existing hash. + */ + futex_ref_drop(cur); + } + + if (new) { + /* + * Two updates raced; throw out the lesser one. + */ + if (futex_hash_less(new, fph)) { + free = new; + new = fph; + } else { + free = fph; + } + } else { + new = fph; + } + fph = NULL; + } + + if (new) { + /* + * Will set mm->futex_phash_new on failure; + * futex_private_hash_get() will try again. + */ + if (!__futex_pivot_hash(mm, new) && custom) + goto again; + } + } + return 0; +} + +int futex_hash_allocate_default(void) +{ + unsigned int threads, buckets, current_buckets = 0; + struct futex_private_hash *fph; + + if (!current->mm) + return 0; + + scoped_guard(rcu) { + threads = min_t(unsigned int, + get_nr_threads(current), + num_online_cpus()); + + fph = rcu_dereference(current->mm->futex_phash); + if (fph) { + if (fph->custom) + return 0; + + current_buckets = fph->hash_mask + 1; + } + } + + /* + * The default allocation will remain within + * 16 <= threads * 4 <= global hash size + */ + buckets = roundup_pow_of_two(4 * threads); + buckets = clamp(buckets, 16, futex_hashmask + 1); + + if (current_buckets >= buckets) + return 0; + + return futex_hash_allocate(buckets, 0); +} + +static int futex_hash_get_slots(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->hash_mask) + return fph->hash_mask + 1; + return 0; +} + +#else + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + return -EINVAL; +} + +static int futex_hash_get_slots(void) +{ + return 0; +} + +#endif + +int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) +{ + unsigned int flags = FH_CUSTOM; + int ret; + + switch (arg2) { + case PR_FUTEX_HASH_SET_SLOTS: + if (arg4) + return -EINVAL; + ret = futex_hash_allocate(arg3, flags); + break; + + case PR_FUTEX_HASH_GET_SLOTS: + ret = futex_hash_get_slots(); + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + static int __init futex_init(void) { unsigned long hashsize, i; - unsigned int futex_shift; + unsigned int order, n; + unsigned long size; #ifdef CONFIG_BASE_SMALL hashsize = 16; #else - hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = 256 * num_possible_cpus(); + hashsize /= num_possible_nodes(); + hashsize = max(4, hashsize); + hashsize = roundup_pow_of_two(hashsize); #endif + futex_hashshift = ilog2(hashsize); + size = sizeof(struct futex_hash_bucket) * hashsize; + order = get_order(size); + + for_each_node(n) { + struct futex_hash_bucket *table; + + if (order > MAX_PAGE_ORDER) + table = vmalloc_huge_node(size, GFP_KERNEL, n); + else + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); + + BUG_ON(!table); - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - 
hashsize, 0, 0, - &futex_shift, NULL, - hashsize, hashsize); - hashsize = 1UL << futex_shift; + for (i = 0; i < hashsize; i++) + futex_hash_bucket_init(&table[i], NULL); - for (i = 0; i < hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); + futex_queues[n] = table; } futex_hashmask = hashsize - 1; + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 6b2f4c7eb720..30c2afa03889 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -7,6 +7,7 @@ #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> @@ -38,6 +39,7 @@ #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 +#define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) @@ -53,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op) return flags; } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) @@ -66,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2) if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; + if (flags2 & FUTEX2_MPOL) + flags |= FLAGS_MPOL; + return flags; } @@ -86,6 +91,19 @@ static inline bool futex_flags_valid(unsigned int flags) if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; + /* + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid + * in a futex word. 
+ */ + if (flags & FLAGS_NUMA) { + int bits = 8 * futex_size(flags); + u64 max = ~0ULL; + + max >>= 64 - bits; + if (nr_node_ids >= max) + return false; + } + return true; } @@ -117,6 +135,7 @@ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; + struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* @@ -156,6 +175,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() + * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so @@ -182,6 +202,7 @@ struct futex_q { union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; + bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif @@ -196,12 +217,33 @@ enum futex_access { extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); - +extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); +#ifdef CONFIG_FUTEX_PRIVATE_HASH +extern void futex_hash_get(struct futex_hash_bucket *hb); +extern void futex_hash_put(struct futex_hash_bucket *hb); + +extern struct futex_private_hash *futex_private_hash(void); +extern void futex_private_hash_put(struct futex_private_hash *fph); + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ +static inline void futex_hash_get(struct futex_hash_bucket *hb) { } +static inline void futex_hash_put(struct futex_hash_bucket *hb) { } +static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } +static inline void futex_private_hash_put(struct futex_private_hash *fph) { } +#endif + +DEFINE_CLASS(hb, struct futex_hash_bucket *, + if (_T) futex_hash_put(_T), + futex_hash(key), union futex_key *key); + +DEFINE_CLASS(private_hash, struct futex_private_hash *, + if (_T) futex_private_hash_put(_T), + futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal @@ -219,9 +261,9 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2) } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb); -extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout); + struct futex_q *q, union futex_key *key2, + struct task_struct *task); +extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); @@ -239,49 +281,11 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 return ret; } -/* - * This does a plain atomic user space read, and the user pointer has - * already been verified earlier by get_futex_key() to be both aligned - * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). - * - * We still want to avoid any speculation, and while __get_user() is - * the traditional model for this, it's actually slower than doing - * this manually these days. 
- * - * We could just have a per-architecture special function for it, - * the same way we do futex_atomic_cmpxchg_inatomic(), but rather - * than force everybody to do that, write it out long-hand using - * the low-level user-access infrastructure. - * - * This looks a bit overkill, but generally just results in a couple - * of instructions. - */ -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) -{ - u32 val; - - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(val, from, Efault); - user_read_access_end(); - *dest = val; - return 0; -Efault: - user_read_access_end(); - return -EFAULT; -} - +/* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { - int ret; - - pagefault_disable(); - ret = futex_read_inatomic(dest, from); - pagefault_enable(); - - return ret; + guard(pagefault)(); + return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); @@ -354,7 +358,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); +extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 7a941845f7ee..dacb2330f1fb 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -806,7 +806,7 @@ handle_err: break; } - spin_lock(q->lock_ptr); + futex_q_lockptr_lock(q); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* @@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl struct hrtimer_sleeper timeout, *to; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); int res, ret; @@ -939,151 +938,183 @@ retry: goto out; retry_private: - hb = futex_q_lock(&q); + if (1) { + CLASS(hb, hb)(&q.key); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, - &exiting, 0); - if (unlikely(ret)) { - /* - * Atomic work succeeded and we got the lock, - * or failed. Either way, we do _not_ block. - */ - switch (ret) { - case 1: - /* We got the lock. */ - ret = 0; - goto out_unlock_put_key; - case -EFAULT: - goto uaddr_faulted; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Task is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - futex_q_unlock(hb); + futex_q_lock(&q, hb); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); + if (unlikely(ret)) { /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock_put_key; + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Task is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. 
+ */ + futex_q_unlock(hb); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } } - } - WARN_ON(!q.pi_state); + WARN_ON(!q.pi_state); - /* - * Only actually queue now that the atomic ops are done: - */ - __futex_queue(&q, hb, current); + /* + * Only actually queue now that the atomic ops are done: + */ + __futex_queue(&q, hb, current); - if (trylock) { - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - goto no_block; - } + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } - /* - * Must be done before we enqueue the waiter, here is unfortunately - * under the hb lock, but that *should* work because it does nothing. - */ - rt_mutex_pre_schedule(); + /* + * Caution; releasing @hb in-scope. The hb->lock is still locked + * while the reference is dropped. The reference can not be dropped + * after the unlock because if a user initiated resize is in progress + * then we might need to wake him. This can not be done after the + * rt_mutex_pre_schedule() invocation. The hb will remain valid because + * the thread, performing resize, will block on hb->lock during + * the requeue. + */ + futex_hash_put(no_free_ptr(hb)); + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); - rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter); - /* - * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not - * hold it while doing rt_mutex_start_proxy(), because then it will - * include hb->lock in the blocking chain, even through we'll not in - * fact hold it while blocking. This will lead it to report -EDEADLK - * and BUG when futex_unlock_pi() interleaves with this. - * - * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling __rt_mutex_start_proxy_lock(). This - * interleaves with futex_unlock_pi() -- which does a similar lock - * handoff -- such that the latter can observe the futex_q::pi_state - * before __rt_mutex_start_proxy_lock() is done. - */ - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - spin_unlock(q.lock_ptr); - /* - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter - * such that futex_unlock_pi() is guaranteed to observe the waiter when - * it sees the futex_q::pi_state. - */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); + /* + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling __rt_mutex_start_proxy_lock(). 
This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); - if (ret) { - if (ret == 1) - ret = 0; - goto cleanup; - } + if (ret) { + if (ret == 1) + ret = 0; + goto cleanup; + } - if (unlikely(to)) - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + if (unlikely(to)) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - /* - * If we failed to acquire the lock (deadlock/signal/timeout), we must - * must unwind the above, however we canont lock hb->lock because - * rt_mutex already has a waiter enqueued and hb->lock can itself try - * and enqueue an rt_waiter through rtlock. - * - * Doing the cleanup without holding hb->lock can cause inconsistent - * state between hb and pi_state, but only in the direction of not - * seeing a waiter that is leaving. - * - * See futex_unlock_pi(), it deals with this inconsistency. - * - * There be dragons here, since we must deal with the inconsistency on - * the way out (here), it is impossible to detect/warn about the race - * the other way around (missing an incoming waiter). - * - * What could possibly go wrong... - */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) - ret = 0; + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must + * unwind the above, however we canont lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. + * + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. + * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). + * + * What could possibly go wrong... + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; - /* - * Now that the rt_waiter has been dequeued, it is safe to use - * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up - * the - */ - spin_lock(q.lock_ptr); - /* - * Waiter is unqueued. - */ - rt_mutex_post_schedule(); + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + futex_q_lockptr_lock(&q); + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_pi_owner(uaddr, &q, !ret); - /* - * If fixup_pi_owner() returned an error, propagate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? 
res : 0; - - futex_unqueue_pi(&q); - spin_unlock(q.lock_ptr); - goto out; + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from futex_unlock_pi() */ + futex_hash_put(hb); + } + goto out; out_unlock_put_key: - futex_q_unlock(hb); + futex_q_unlock(hb); + goto out; + +uaddr_faulted: + futex_q_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } out: if (to) { @@ -1091,18 +1122,6 @@ out: destroy_hrtimer_on_stack(&to->timer); } return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - futex_q_unlock(hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; } /* @@ -1114,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb; struct futex_q *top_waiter; int ret; @@ -1134,7 +1152,7 @@ retry: if (ret) return ret; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); spin_lock(&hb->lock); retry_hb: @@ -1187,6 +1205,12 @@ retry_hb: */ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); if (!rt_waiter) { + /* + * Acquire a reference for the leaving waiter to ensure + * valid futex_q::lock_ptr. + */ + futex_hash_get(hb); + top_waiter->drop_hb_ref = true; __futex_unqueue(top_waiter); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); goto retry_hb; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index b47bb764b352..d818b4d47f1b 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, futex_hb_waiters_inc(hb2); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; + /* + * hb1 and hb2 belong to the same futex_hash_bucket_private + * because if we managed get a reference on hb1 then it can't be + * replaced. Therefore we avoid put(hb1)+get(hb2) here. + */ } q->key = *key2; } @@ -225,18 +230,25 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - q->key = *key; + struct task_struct *task; + q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - + /* + * Acquire a reference for the waiter to ensure valid + * futex_q::lock_ptr. 
+ */ + futex_hash_get(hb); + q->drop_hb_ref = true; q->lock_ptr = &hb->lock; + task = READ_ONCE(q->task); /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); + wake_up_state(task, TASK_NORMAL); } /** @@ -371,7 +383,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1, union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); @@ -443,240 +454,242 @@ retry: if (requeue_pi && futex_match(&key1, &key2)) return -EINVAL; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - futex_hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); - if (likely(cmpval != NULL)) { - u32 curval; - - ret = futex_get_value_locked(&curval, uaddr1); + futex_hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); + if (likely(cmpval != NULL)) { + u32 curval; - ret = get_user(curval, uaddr1); - if (ret) - return ret; + ret = futex_get_value_locked(&curval, uaddr1); - if (!(flags1 & FLAGS_SHARED)) - goto retry_private; + if (unlikely(ret)) { + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } + ret = get_user(curval, uaddr1); + if (ret) + return ret; - if (requeue_pi) { - struct task_struct *exiting = NULL; + if (!(flags1 & FLAGS_SHARED)) + goto retry_private; - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. - * - * Updates topwaiter::requeue_state if a top waiter exists. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, - &exiting, nr_requeue); + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } - /* - * At this point the top_waiter has either taken uaddr2 or - * is waiting on it. In both cases pi_state has been - * established and an initial refcount on it. In case of an - * error there's nothing. - * - * The top waiter's requeue_state is up to date: - * - * - If the lock was acquired atomically (ret == 1), then - * the state is Q_REQUEUE_PI_LOCKED. - * - * The top waiter has been dequeued and woken up and can - * return to user space immediately. The kernel/user - * space state is consistent. In case that there must be - * more waiters requeued the WAITERS bit in the user - * space futex is set so the top waiter task has to go - * into the syscall slowpath to unlock the futex. This - * will block until this requeue operation has been - * completed and the hash bucket locks have been - * dropped. - * - * - If the trylock failed with an error (ret < 0) then - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing - * happened", or Q_REQUEUE_PI_IGNORE when there was an - * interleaved early wakeup. - * - * - If the trylock did not succeed (ret == 0) then the - * state is either Q_REQUEUE_PI_IN_PROGRESS or - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. - * This will be cleaned up in the loop below, which - * cannot fail because futex_proxy_trylock_atomic() did - * the same sanity checks for requeue_pi as the loop - * below does. 
- */ - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; + if (requeue_pi) { + struct task_struct *exiting = NULL; - case 1: /* - * futex_proxy_trylock_atomic() acquired the user space - * futex. Adjust task_count. + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ - task_count++; - ret = 0; - break; + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); - /* - * If the above failed, then pi_state is NULL and - * waiter::requeue_state is correct. - */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - return ret; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Owner is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret == 1), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!futex_match(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). 
- */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; - } - - /* Plain futexes just wake or requeue and are done */ - if (!requeue_pi) { - if (++task_count <= nr_wake) - this->wake(&wake_q, this); - else - requeue_futex(this, hb1, hb2, &key2); - continue; + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ + case -EFAULT: + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } } - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (!futex_match(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - * - * Prepare the waiter to take the rt_mutex. Take a refcount - * on the pi_state and store the pointer in the futex_q - * object of the waiter. - */ - get_pi_state(pi_state); + if (!futex_match(&this->key, &key1)) + continue; - /* Don't requeue when the waiter is already on the way out. */ - if (!futex_requeue_pi_prepare(this, pi_state)) { /* - * Early woken waiter signaled that it is on the - * way out. Drop the pi_state reference and try the - * next waiter. @this->pi_state is still NULL. + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ - put_pi_state(pi_state); - continue; - } - - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + this->wake(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (!futex_match(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } - if (ret == 1) { - /* - * We got the lock. We do neither drop the refcount - * on pi_state nor clear this->pi_state because the - * waiter needs the pi_state for cleaning up the - * user space value. It will drop the refcount - * after doing so. this::requeue_state is updated - * in the wakeup as well. 
- */ - requeue_pi_wake_futex(this, &key2, hb2); - task_count++; - } else if (!ret) { - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - futex_requeue_pi_complete(this, 0); - task_count++; - } else { /* - * rt_mutex_start_proxy_lock() detected a potential - * deadlock when we tried to queue that waiter. - * Drop the pi_state reference which we took above - * and remove the pointer to the state from the - * waiters futex_q object. + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - this->pi_state = NULL; - put_pi_state(pi_state); - futex_requeue_pi_complete(this, ret); - /* - * We stop queueing more waiters and let user space - * deal with the mess. - */ - break; + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; + } } - } - /* - * We took an extra initial reference to the pi_state in - * futex_proxy_trylock_atomic(). We need to drop it here again. - */ - put_pi_state(pi_state); + /* + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. + */ + put_pi_state(pi_state); out_unlock: - double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); - futex_hb_waiters_dec(hb2); return ret ? ret : task_count; } @@ -769,7 +782,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; struct rt_mutex_base *pi_mutex; @@ -805,35 +817,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current); if (ret) goto out; - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. 
We need to compare the keys: - */ - if (futex_match(&q.key, &key2)) { - futex_q_unlock(hb); - ret = -EINVAL; - goto out; - } - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); switch (futex_requeue_pi_wakeup_sync(&q)) { case Q_REQUEUE_PI_IGNORE: - /* The waiter is still on uaddr1 */ - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); + { + CLASS(hb, hb)(&q.key); + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + } break; case Q_REQUEUE_PI_LOCKED: /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); ret = fixup_pi_owner(uaddr2, &q, true); /* * Drop the reference to the pi state which the @@ -860,7 +865,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we @@ -892,6 +897,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, default: BUG(); } + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from requeue_pi_wake_futex() */ + futex_hash_put(hb); + } out: if (to) { diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 4b6da9116aa6..880c9bf2f315 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, return 0; } +static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) +{ +#ifdef CONFIG_COMPAT + if (compat) + return p->compat_robust_list; +#endif + return p->robust_list; +} + +static void __user *futex_get_robust_list_common(int pid, bool compat) +{ + struct task_struct *p = current; + void __user *head; + int ret; + + scoped_guard(rcu) { + if (pid) { + p = find_task_by_vpid(pid); + if (!p) + return (void __user *)ERR_PTR(-ESRCH); + } + get_task_struct(p); + } + + /* + * Hold exec_update_lock to serialize with concurrent exec() + * so ptrace_may_access() is checked against stable credentials + */ + ret = down_read_killable(&p->signal->exec_update_lock); + if (ret) + goto err_put; + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = futex_task_robust_list(p, compat); + + up_read(&p->signal->exec_update_lock); + put_task_struct(p); + + return head; + +err_unlock: + up_read(&p->signal->exec_update_lock); +err_put: + put_task_struct(p); + return (void __user *)ERR_PTR(ret); +} + /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] @@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { - struct robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); - head = p->robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if 
(put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); - head = p->compat_robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 25877d4f2f8f..1c2dd03f11ec 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -154,7 +154,6 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { - struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); @@ -170,7 +169,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if ((flags & FLAGS_STRICT) && !nr_wake) return 0; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) @@ -253,7 +252,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); @@ -266,67 +264,69 @@ retry: if (unlikely(ret != 0)) return ret; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); - - if (!IS_ENABLED(CONFIG_MMU) || - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { - /* - * we don't get EFAULT from MMU faults if we don't have - * an MMU, but we might get them from range checking - */ - ret = op_ret; - return ret; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); + + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { + double_unlock_hb(hb1, hb2); + + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ + ret = op_ret; return ret; - } - - cond_resched(); - if (!(flags & FLAGS_SHARED)) - goto retry_private; - goto retry; - } + } - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (futex_match (&this->key, &key1)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + return ret; } - this->wake(&wake_q, this); - if (++ret >= nr_wake) - break; + + 
cond_resched(); + if (!(flags & FLAGS_SHARED)) + goto retry_private; + goto retry; } - } - if (op_ret > 0) { - op_ret = 0; - plist_for_each_entry_safe(this, next, &hb2->chain, list) { - if (futex_match (&this->key, &key2)) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); - if (++op_ret >= nr_wake2) + if (++ret >= nr_wake) break; } } - ret += op_ret; - } + + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (futex_match(&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + this->wake(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } out_unlock: - double_unlock_hb(hb1, hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); return ret; } @@ -334,23 +334,12 @@ out_unlock: static long futex_wait_restart(struct restart_block *restart); /** - * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller + * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ -void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) +void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { - /* - * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using smp_store_mb() and - * futex_queue() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb, current); - /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); @@ -412,12 +401,17 @@ int futex_unqueue_multiple(struct futex_vector *v, int count) */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { - struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* + * Make sure to have a reference on the private_hash such that we + * don't block on rehash after changing the task state below. + */ + guard(private_hash)(); + + /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to @@ -451,20 +445,24 @@ retry: struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; - hb = futex_q_lock(q); - ret = futex_get_value_locked(&uval, uaddr); + if (1) { + CLASS(hb, hb)(&q->key); - if (!ret && uval == val) { - /* - * The bucket lock can't be held while dealing with the - * next futex. Queue each futex at this moment so hb can - * be unlocked. - */ - futex_queue(q, hb, current); - continue; - } + futex_q_lock(q, hb); + ret = futex_get_value_locked(&uval, uaddr); + + if (!ret && uval == val) { + /* + * The bucket lock can't be held while dealing with the + * next futex. Queue each futex at this moment so hb can + * be unlocked. + */ + futex_queue(q, hb, current); + continue; + } - futex_q_unlock(hb); + futex_q_unlock(hb); + } __set_current_state(TASK_RUNNING); /* @@ -578,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) 
* @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller + * @key2: the second futex_key if used for requeue PI + * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. @@ -586,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * * Return: * - 0 - uaddr contains val and hb has been locked; - * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not + * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) + struct futex_q *q, union futex_key *key2, + struct task_struct *task) { u32 uval; int ret; @@ -618,26 +619,45 @@ retry: return ret; retry_private: - *hb = futex_q_lock(q); + if (1) { + CLASS(hb, hb)(&q->key); - ret = futex_get_value_locked(&uval, uaddr); + futex_q_lock(q, hb); - if (ret) { - futex_q_unlock(*hb); + ret = futex_get_value_locked(&uval, uaddr); - ret = get_user(uval, uaddr); - if (ret) - return ret; + if (ret) { + futex_q_unlock(hb); - if (!(flags & FLAGS_SHARED)) - goto retry_private; + ret = get_user(uval, uaddr); + if (ret) + return ret; - goto retry; - } + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + + if (uval != val) { + futex_q_unlock(hb); + return -EWOULDBLOCK; + } + + if (key2 && futex_match(&q->key, key2)) { + futex_q_unlock(hb); + return -EINVAL; + } - if (uval != val) { - futex_q_unlock(*hb); - ret = -EWOULDBLOCK; + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using smp_store_mb() and + * futex_queue() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + if (task == current) + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + futex_queue(q, hb, task); } return ret; @@ -647,7 +667,6 @@ int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; - struct futex_hash_bucket *hb; int ret; if (!bitset) @@ -660,12 +679,12 @@ retry: * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. 
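The reworked futex_wait_setup() kernel-doc above now spells out the contract: the value at uaddr is re-read under the hash-bucket lock, -EWOULDBLOCK comes back if it no longer matches the expected value, and otherwise the waiter is queued with its task state already set so callers only need futex_do_wait(). The same check-then-sleep contract is visible from user space through futex(2). The following demo is editorial and self-contained; it uses only the plain FUTEX_WAIT/FUTEX_WAKE syscall interface, nothing introduced by this patch.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int futex_word;

static long sys_futex(atomic_int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waker(void *arg)
{
        usleep(100 * 1000);                     /* let main() reach FUTEX_WAIT */
        atomic_store(&futex_word, 1);
        sys_futex(&futex_word, FUTEX_WAKE, 1);  /* wake one waiter */
        return arg;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waker, NULL);
        while (atomic_load(&futex_word) == 0) {
                /*
                 * Like futex_wait_setup(): the kernel re-checks the word and
                 * fails with EAGAIN/EWOULDBLOCK when it already changed,
                 * otherwise the task is queued and sleeps until FUTEX_WAKE.
                 */
                if (sys_futex(&futex_word, FUTEX_WAIT, 0) == -1 &&
                    errno != EAGAIN && errno != EINTR)
                        break;
        }
        pthread_join(t, NULL);
        printf("woken, futex_word=%d\n", atomic_load(&futex_word));
        return 0;
}

Build with "cc -pthread"; the race where the waker stores 1 between the load and the FUTEX_WAIT is exactly the case the in-kernel value check catches.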
*/ if (!futex_unqueue(&q)) @@ -719,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; + ktime_t *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) + tp = &restart->futex.time; - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index fd75b4a484d7..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,14 +18,12 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 -#elif (__GNUC__ >= 7) -#define GCOV_COUNTERS 9 -#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) -#define GCOV_COUNTERS 10 #else #define GCOV_COUNTERS 9 #endif diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index c9e5dc068e85..896a503dfb29 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -4,79 +4,34 @@ # This script generates an archive consisting of kernel headers # for CONFIG_IKHEADERS. set -e -sfile="$(readlink -f "$0")" -outdir="$(pwd)" tarfile=$1 -tmpdir=$outdir/${tarfile%/*}/.tmp_dir - -dir_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; -# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter - -all_dirs= -if [ "$building_out_of_srctree" ]; then - for d in $dir_list; do - all_dirs="$all_dirs $srctree/$d" - done -fi -all_dirs="$all_dirs $dir_list" - -# include/generated/utsversion.h is ignored because it is generated after this -# script is executed. (utsversion.h is unneeded for kheaders) -# -# When Kconfig regenerates include/generated/autoconf.h, its timestamp is -# updated, but the contents might be still the same. When any CONFIG option is -# changed, Kconfig touches the corresponding timestamp file include/config/*. -# Hence, the md5sum detects the configuration change anyway. We do not need to -# check include/generated/autoconf.h explicitly. -# -# Ignore them for md5 calculation to avoid pointless regeneration. -headers_md5="$(find $all_dirs -name "*.h" -a \ - ! -path include/generated/utsversion.h -a \ - ! -path include/generated/autoconf.h | - xargs ls -l | md5sum | cut -d ' ' -f1)" - -# Any changes to this script will also cause a rebuild of the archive. -this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && - [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then - exit -fi - -echo " GEN $tarfile" +srclist=$2 +objlist=$3 +timestamp=$4 + +dir=$(dirname "${tarfile}") +tmpdir=${dir}/.tmp_dir +depfile=${dir}/.$(basename "${tarfile}").d + +# generate dependency list. 
+{ + echo + echo "deps_${tarfile} := \\" + sed 's:\(.*\): \1 \\:' "${srclist}" + sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}" + echo + echo "${tarfile}: \$(deps_${tarfile})" + echo + echo "\$(deps_${tarfile}):" + +} > "${depfile}" rm -rf "${tmpdir}" mkdir "${tmpdir}" -if [ "$building_out_of_srctree" ]; then - ( - cd $srctree - for f in $dir_list - do find "$f" -name "*.h"; - done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - ) -fi - -for f in $dir_list; - do find "$f" -name "*.h"; -done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - -# Always exclude include/generated/utsversion.h -# Otherwise, the contents of the tarball may vary depending on the build steps. -rm -f "${tmpdir}/include/generated/utsversion.h" +# shellcheck disable=SC2154 # srctree is passed as an env variable +sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}" +${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}" # Remove comments except SDPX lines # Use a temporary file to store directory contents to prevent find/xargs from @@ -88,12 +43,8 @@ xargs -0 -P8 -n1 \ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ +${TAR} "${timestamp:+--mtime=$timestamp}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ - -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null - -echo $headers_md5 > kernel/kheaders.md5 -echo "$this_file_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . > /dev/null rm -rf "${tmpdir}" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index dc898ec93463..d2254c91450b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -22,6 +22,9 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> +#include <linux/hung_task.h> +#include <linux/rwsem.h> +#include <linux/sys_info.h> #include <trace/events/sched.h> @@ -48,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: @@ -58,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; -static bool hung_task_show_lock; static bool hung_task_call_panic; -static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +/* + * A bitmask to control what kinds of system info to be printed when + * a hung task is detected, it could be task, memory, lock etc. Refer + * include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hung_task_si_mask; + #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? 
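The new hung_task_si_mask above is a plain bitmask selecting which categories of system information are dumped once a hung task has been found; the real bit definitions live in include/linux/sys_info.h, which is not part of this hunk. A rough userspace model of that gating follows (the SI_* names and values are made up for illustration).

#include <stdio.h>

#define SI_TASKS   (1UL << 0)   /* illustrative bits, not the kernel's */
#define SI_MEM     (1UL << 1)
#define SI_LOCKS   (1UL << 2)
#define SI_ALL_BT  (1UL << 3)

static void sys_info_model(unsigned long mask)
{
        if (mask & SI_TASKS)
                puts("dump task states");
        if (mask & SI_MEM)
                puts("dump memory info");
        if (mask & SI_LOCKS)
                puts("dump held locks");            /* debug_show_all_locks() */
        if (mask & SI_ALL_BT)
                puts("dump all CPU backtraces");
}

int main(void)
{
        unsigned long si_mask = 0;                  /* sysctl-provided base mask */

        /*
         * Mirrors the later check_hung_uninterruptible_tasks() hunk: warnings
         * pull in the lock dump, and the all-CPU-backtrace sysctl adds the
         * backtrace bit on top of whatever the sysctl mask requested.
         */
        si_mask |= SI_LOCKS;
        si_mask |= SI_ALL_BT;
        sys_info_model(si_mask);
        return 0;
}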
@@ -79,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); + CONFIG_BOOTPARAM_HUNG_TASK_PANIC; static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -93,68 +100,135 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +static bool task_is_hung(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned int state = READ_ONCE(t->__state); + + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer + */ + if (!(state & TASK_UNINTERRUPTIBLE) || + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) + return false; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. + */ + if (unlikely(!switch_count)) + return false; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + t->last_switch_time = jiffies; + return false; + } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return false; + + return true; +} #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER -static void debug_show_blocker(struct task_struct *task) +static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; - unsigned long owner; - struct mutex *lock; + unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); - lock = READ_ONCE(task->blocker_mutex); - if (!lock) + blocker = READ_ONCE(task->blocker); + if (!blocker) return; - owner = mutex_get_owner(lock); + blocker_type = hung_task_get_blocker_type(blocker); + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_SEM: + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? + "reader" : "writer"; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (unlikely(!owner)) { - pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", - task->comm, task->pid); + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; + } return; } /* Ensure the owner information is correct. 
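debug_show_blocker() above recovers both a lock pointer and a blocker type from the single task->blocker word through hung_task_get_blocker_type() and hung_task_blocker_to_lock(); those helpers come from <linux/hung_task.h> and are not shown in this diff. Below is a compilable model of the tag-in-the-low-pointer-bits scheme they imply, with illustrative mask and type values rather than the kernel's.

#include <assert.h>
#include <stdio.h>

#define BLOCKER_TYPE_MASK 0x3UL     /* illustrative: low bits carry the type */
enum blocker_type { TYPE_MUTEX, TYPE_SEM, TYPE_RWSEM_READER, TYPE_RWSEM_WRITER };

static unsigned long encode_blocker(void *lock, enum blocker_type t)
{
        /* works because lock objects are at least 4-byte aligned */
        return (unsigned long)lock | t;
}

static void *blocker_to_lock(unsigned long b)
{
        return (void *)(b & ~BLOCKER_TYPE_MASK);
}

static enum blocker_type blocker_type(unsigned long b)
{
        return (enum blocker_type)(b & BLOCKER_TYPE_MASK);
}

int main(void)
{
        long lock_storage;          /* stand-in for a semaphore */
        unsigned long b = encode_blocker(&lock_storage, TYPE_SEM);

        assert(blocker_to_lock(b) == &lock_storage);
        assert(blocker_type(b) == TYPE_SEM);
        printf("type=%d lock=%p\n", blocker_type(b), blocker_to_lock(b));
        return 0;
}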
*/ for_each_process_thread(g, t) { - if ((unsigned long)t == owner) { + if ((unsigned long)t != owner) + continue; + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", - task->comm, task->pid, t->comm, t->pid); - sched_show_task(t); - return; + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } + /* Avoid duplicated task dump, skip if the task is also hung. */ + if (!task_is_hung(t, timeout)) + sched_show_task(t); + return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout, + unsigned long prev_detect_count) { - unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned long total_hung_task; - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. - */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -163,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) */ sysctl_hung_task_detect_count++; + total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); - hung_task_show_lock = true; hung_task_call_panic = true; } @@ -189,11 +263,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); - hung_task_show_lock = true; + debug_show_blocker(t, timeout); - if (sysctl_hung_task_all_cpu_backtrace) - hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } @@ -234,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + unsigned long prev_detect_count = sysctl_hung_task_detect_count; + int need_warning = sysctl_hung_task_warnings; + unsigned long si_mask = hung_task_si_mask; /* * If the system crashed already then all bets are off, @@ -242,10 +316,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - hung_task_show_lock = false; + rcu_read_lock(); 
for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -254,26 +327,24 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, timeout, prev_detect_count); } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); - if (hung_task_show_all_bt) { - hung_task_show_all_bt = false; - trigger_all_cpu_backtrace(); + if (!(sysctl_hung_task_detect_count - prev_detect_count)) + return; + + if (need_warning || hung_task_call_panic) { + si_mask |= SYS_INFO_LOCKS; + + if (sysctl_hung_task_all_cpu_backtrace) + si_mask |= SYS_INFO_ALL_BT; } + sys_info(si_mask); + if (hung_task_call_panic) panic("hung_task: blocked tasks"); } @@ -331,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "hung_task_check_count", @@ -372,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "hung_task_sys_info", + .data = &hung_task_si_mask, + .maxlen = sizeof(hung_task_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3f02a0e45254..1b4254d19a73 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool @@ -144,6 +140,19 @@ config GENERIC_IRQ_DEBUGFS config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD bool +config IRQ_KUNIT_TEST + bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS + depends on KUNIT=y + depends on SPARSE_IRQ + default KUNIT_ALL_TESTS + select IRQ_DOMAIN + imply SMP + help + This option enables KUnit tests for the IRQ subsystem API. These are + only for development and testing, not for regular kernel use cases. + + If unsure, say N. + endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c0f44c06d69d..6ab3a4055667 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o +obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 44a4eba80315..4013e6ad2b2f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) * have multiple sets, build each sets affinity mask separately. 
*/ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { - unsigned int this_vecs = affd->set_size[i]; - int j; - struct cpumask *result = group_cpus_evenly(this_vecs); + unsigned int nr_masks, this_vecs = affd->set_size[i]; + struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks); if (!result) { kfree(masks); return NULL; } - for (j = 0; j < this_vecs; j++) + for (int j = 0; j < nr_masks; j++) cpumask_copy(&masks[curvec + j].mask, &result[j]); kfree(result); - curvec += this_vecs; - usedvecs += this_vecs; + curvec += nr_masks; + usedvecs += nr_masks; } /* Fill out vectors at the end that don't need affinity */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index ae60cae24e9a..d0af8a8b3ae6 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -43,18 +43,16 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: */ if (desc->irq_data.chip->irq_set_type) - desc->irq_data.chip->irq_set_type(&desc->irq_data, - IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); irq_activate_and_startup(desc, IRQ_NORESEND); } - raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. */ @@ -66,13 +64,12 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } - raw_spin_unlock_irq(&desc->lock); } /* @@ -84,18 +81,16 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. 
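The guard(raw_spinlock_irq)(&desc->lock) and scoped_guard() conversions in these hunks, like the CLASS(hb, ...) hash-bucket references in the futex changes earlier, rest on the scope-based cleanup helpers from <linux/cleanup.h>, which boil down to the compiler's cleanup attribute. A small userspace model of the idea, assuming nothing beyond GNU C (the guard_mutex macro below is invented for the demo):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **lockp)
{
        pthread_mutex_unlock(*lockp);
}

/* lock now, unlock automatically when the enclosing scope is left */
#define guard_mutex(lock) \
        pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup))) = \
                ({ pthread_mutex_lock(lock); (lock); })

static int counter;

static void bump(void)
{
        guard_mutex(&demo_lock);        /* acquired here ...              */
        counter++;
        if (counter == 42)
                return;                 /* ... released on early return   */
}                                       /* ... and on normal scope exit   */

int main(void)
{
        bump();
        printf("counter=%d\n", counter);
        return 0;
}

The payoff is what the converted autoprobe and chip code above shows: the unlock cannot be forgotten on any return path, so the explicit raw_spin_unlock_irq() calls and out_unlock labels disappear.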
*/ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); - } else - if (i < 32) - mask |= 1 << i; + } else if (i < 32) { + mask |= 1 << i; + } } - raw_spin_unlock_irq(&desc->lock); } return mask; @@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; @@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val) struct irq_desc *desc; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) @@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 36cf1b09cc84..678f094d261a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,98 +34,80 @@ struct irqaction chained_action = { }; /** - * irq_set_chip - set the irq chip for an irq - * @irq: irq number - * @chip: pointer to irq chip description structure + * irq_set_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure */ int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - - desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); - irq_put_desc_unlock(desc, flags); - /* - * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. - */ - irq_mark_irq(irq); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); + ret = 0; + } + /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. 
*/ + if (!ret) + irq_mark_irq(irq); + return ret; } EXPORT_SYMBOL(irq_set_chip); /** - * irq_set_irq_type - set the irq trigger type for an irq - * @irq: irq number - * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * irq_set_irq_type - set the irq trigger type for an irq + * @irq: irq number + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; - - ret = __irq_set_trigger(desc, type); - irq_put_desc_busunlock(desc, flags); - return ret; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) + return __irq_set_trigger(scoped_irqdesc, type); + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_type); /** - * irq_set_handler_data - set irq handler data for an irq - * @irq: Interrupt number - * @data: Pointer to interrupt specific data + * irq_set_handler_data - set irq handler data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data * - * Set the hardware irq controller data for an irq + * Set the hardware irq controller data for an irq */ int irq_set_handler_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_common_data.handler_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_common_data.handler_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset - * @irq_base: Interrupt number base - * @irq_offset: Interrupt number offset - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq at offset + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, - struct msi_desc *entry) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - desc->irq_common_data.msi_desc = entry; - if (entry && !irq_offset) - entry->irq = irq_base; - irq_put_desc_unlock(desc, flags); - return 0; +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry) +{ + scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->irq_common_data.msi_desc = entry; + if (entry && !irq_offset) + entry->irq = irq_base; + return 0; + } + return -EINVAL; } /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { @@ -133,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) } /** - * irq_set_chip_data - set irq chip data for an irq - * @irq: Interrupt number - * @data: Pointer to chip specific data + * 
irq_set_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data * - * Set the hardware irq chip data for an irq + * Set the hardware irq chip data for an irq */ int irq_set_chip_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_data.chip_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_chip_data); @@ -223,6 +202,27 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } + +void irq_startup_managed(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + + /* + * Only start it up when the disable depth is 1, so that a disable, + * hotunplug, hotplug sequence does not end up enabling it during + * hotplug unconditionally. + */ + desc->depth--; + if (!desc->depth) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); +} + #else static __always_inline int __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, @@ -290,6 +290,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: + desc->depth = 1; irqd_set_managed_shutdown(d); return 0; } @@ -322,7 +323,13 @@ void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { clear_irq_resend(desc); - desc->depth = 1; + /* + * Increment disable depth, so that a managed shutdown on + * CPU hotunplug preserves the actual disabled state when the + * CPU comes back online. See irq_startup_managed(). + */ + desc->depth++; + if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); irq_state_set_disabled(desc); @@ -450,64 +457,33 @@ void unmask_threaded_irq(struct irq_desc *desc) unmask_irq(desc); } -/* - * handle_nested_irq - Handle a nested irq from a irq thread - * @irq: the interrupt number - * - * Handle interrupts which are nested into a threaded interrupt - * handler. The handler function is called inside the calling - * threads context. 
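The depth handling above changes the managed-shutdown path: irq_shutdown() now increments the disable depth instead of resetting it to 1, and irq_startup_managed() decrements it and only restarts the interrupt once the depth reaches zero, so a driver's explicit disable survives a CPU unplug/replug cycle. A toy model of that bookkeeping, in plain C with made-up helper names and no kernel APIs:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static int depth;                   /* 0 == enabled */
static bool started = true;

static void model_disable(void)              { depth++; }
static void model_shutdown_on_unplug(void)   { depth++; started = false; }

static void model_startup_managed_on_replug(void)
{
        depth--;                    /* undo only the hotplug-induced shutdown */
        if (!depth)
                started = true;
}

static void model_enable(void)
{
        if (depth && !--depth)
                started = true;
}

int main(void)
{
        model_disable();                    /* driver: disable_irq()           */
        model_shutdown_on_unplug();         /* CPU goes away                   */
        model_startup_managed_on_replug();  /* CPU comes back                  */
        assert(!started && depth == 1);     /* still disabled, as intended     */
        model_enable();                     /* driver: enable_irq()            */
        assert(started && depth == 0);
        printf("depth=%d started=%d\n", depth, started);
        return 0;
}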
- */ -void handle_nested_irq(unsigned int irq) +/* Busy wait until INPROGRESS is cleared */ +static bool irq_wait_on_inprogress(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - irqreturn_t action_ret; - - might_sleep(); - - raw_spin_lock_irq(&desc->lock); + if (IS_ENABLED(CONFIG_SMP)) { + do { + raw_spin_unlock(&desc->lock); + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (irqd_irq_inprogress(&desc->irq_data)); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - return; + /* Might have been disabled in meantime */ + return !irqd_irq_disabled(&desc->irq_data) && desc->action; } - - kstat_incr_irqs_this_cpu(desc); - atomic_inc(&desc->threads_active); - raw_spin_unlock_irq(&desc->lock); - - action_ret = IRQ_NONE; - for_each_action_of_desc(desc, action) - action_ret |= action->thread_fn(action->irq, action->dev_id); - - if (!irq_settings_no_debug(desc)) - note_interrupt(desc, action_ret); - - wake_threads_waitq(desc); -} -EXPORT_SYMBOL_GPL(handle_nested_irq); - -static bool irq_check_poll(struct irq_desc *desc) -{ - if (!(desc->istate & IRQS_POLL_INPROGRESS)) - return false; - return irq_wait_for_poll(desc); + return false; } -static bool irq_may_run(struct irq_desc *desc) +static bool irq_can_handle_pm(struct irq_desc *desc) { - unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + struct irq_data *irqd = &desc->irq_data; + const struct cpumask *aff; /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ - if (!irqd_has_set(&desc->irq_data, mask)) + if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED)) return true; /* @@ -515,86 +491,167 @@ static bool irq_may_run(struct irq_desc *desc) * and suspended, disable it and notify the pm core about the * event. */ - if (irq_pm_check_wakeup(desc)) + if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) { + irq_pm_handle_wakeup(desc); + return false; + } + + /* Check whether the interrupt is polled on another CPU */ + if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) { + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + return irq_wait_on_inprogress(desc); + } + + /* The below works only for single target interrupts */ + if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) || + !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq) return false; /* - * Handle a potential concurrent poll on a different core. + * If the interrupt affinity was moved to this CPU and the + * interrupt is currently handled on the previous target CPU, then + * busy wait for INPROGRESS to be cleared. Otherwise for edge type + * interrupts the handler might get stuck on the previous target: + * + * CPU 0 CPU 1 (new target) + * handle_edge_irq() + * repeat: + * handle_event() handle_edge_irq() + * if (INPROGESS) { + * set(PENDING); + * mask(); + * return; + * } + * if (PENDING) { + * clear(PENDING); + * unmask(); + * goto repeat; + * } + * + * This happens when the device raises interrupts with a high rate + * and always before handle_event() completes and the CPU0 handler + * can clear INPROGRESS. This has been observed in virtual machines. 
*/ - return irq_check_poll(desc); + aff = irq_data_get_effective_affinity_mask(irqd); + if (cpumask_first(aff) != smp_processor_id()) + return false; + return irq_wait_on_inprogress(desc); +} + +static inline bool irq_can_handle_actions(struct irq_desc *desc) +{ + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + return false; + } + return true; +} + +static inline bool irq_can_handle(struct irq_desc *desc) +{ + if (!irq_can_handle_pm(desc)) + return false; + + return irq_can_handle_actions(desc); } /** - * handle_simple_irq - Simple and software-decoded IRQs. - * @desc: the interrupt description structure for this irq - * - * Simple interrupts are either sent from a demultiplexing interrupt - * handler or come from hardware, where no interrupt hardware control - * is necessary. + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number * - * Note: The caller is expected to handle the ack, clear, mask and - * unmask issues if necessary. + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling threads + * context. */ -void handle_simple_irq(struct irq_desc *desc) +void handle_nested_irq(unsigned int irq) { - raw_spin_lock(&desc->lock); + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; - if (!irq_may_run(desc)) - goto out_unlock; + might_sleep(); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (!irq_can_handle_actions(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + action = desc->action; + kstat_incr_irqs_this_cpu(desc); + atomic_inc(&desc->threads_active); + } + + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + + if (!irq_settings_no_debug(desc)) + note_interrupt(desc, action_ret); + + wake_threads_waitq(desc); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + +/** + * handle_simple_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control is + * necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and unmask + * issues if necessary. + */ +void handle_simple_irq(struct irq_desc *desc) +{ + guard(raw_spinlock)(&desc->lock); + + if (!irq_can_handle_pm(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; + return; } + if (!irq_can_handle_actions(desc)) + return; + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_simple_irq); /** - * handle_untracked_irq - Simple and software-decoded IRQs. - * @desc: the interrupt description structure for this irq + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq * - * Untracked interrupts are sent from a demultiplexing interrupt - * handler when the demultiplexer does not know which device it its - * multiplexed irq domain generated the interrupt. IRQ's handled - * through here are not subjected to stats tracking, randomness, or - * spurious interrupt detection. 
+ * Untracked interrupts are sent from a demultiplexing interrupt handler + * when the demultiplexer does not know which device it its multiplexed irq + * domain generated the interrupt. IRQ's handled through here are not + * subjected to stats tracking, randomness, or spurious interrupt + * detection. * - * Note: Like handle_simple_irq, the caller is expected to handle - * the ack, clear, mask and unmask issues if necessary. + * Note: Like handle_simple_irq, the caller is expected to handle the ack, + * clear, mask and unmask issues if necessary. */ void handle_untracked_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock, &desc->lock) { + if (!irq_can_handle(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); } - desc->istate &= ~IRQS_PENDING; - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); - raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc); - raw_spin_lock(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); } EXPORT_SYMBOL_GPL(handle_untracked_irq); @@ -617,40 +674,26 @@ static void cond_unmask_irq(struct irq_desc *desc) } /** - * handle_level_irq - Level type irq handler - * @desc: the interrupt description structure for this irq + * handle_level_irq - Level type irq handler + * @desc: the interrupt description structure for this irq * - * Level type interrupts are active as long as the hardware line has - * the active level. This may require to mask the interrupt and unmask - * it after the associated handler has acknowledged the device, so the - * interrupt line is back to inactive. + * Level type interrupts are active as long as the hardware line has the + * active level. This may require to mask the interrupt and unmask it after + * the associated handler has acknowledged the device, so the interrupt + * line is back to inactive. */ void handle_level_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * keep it masked and get out of here - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; - } + if (!irq_can_handle(desc)) + return; kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -675,42 +718,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) } } +static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data) +{ + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(data); +} + /** - * handle_fasteoi_irq - irq handler for transparent controllers - * @desc: the interrupt description structure for this irq + * handle_fasteoi_irq - irq handler for transparent controllers + * @desc: the interrupt description structure for this irq * - * Only a single callback will be issued to the chip: an ->eoi() - * call when the interrupt has been serviced. 
This enables support - * for modern forms of interrupt handlers, which handle the flow - * details in hardware, transparently. + * Only a single callback will be issued to the chip: an ->eoi() call when + * the interrupt has been serviced. This enables support for modern forms + * of interrupt handlers, which handle the flow details in hardware, + * transparently. */ void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * When an affinity change races with IRQ handling, the next interrupt * can arrive on the new CPU before the original CPU has completed * handling the previous one - it may need to be resent. */ - if (!irq_may_run(desc)) { + if (!irq_can_handle_pm(desc)) { if (irqd_needs_resend_when_in_progress(&desc->irq_data)) desc->istate |= IRQS_PENDING; - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (!irq_can_handle_actions(desc)) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); @@ -726,13 +770,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) check_irq_resend(desc, false); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); @@ -770,40 +807,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc) EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); /** - * handle_edge_irq - edge type IRQ handler - * @desc: the interrupt description structure for this irq + * handle_edge_irq - edge type IRQ handler + * @desc: the interrupt description structure for this irq * - * Interrupt occurs on the falling and/or rising edge of a hardware - * signal. The occurrence is latched into the irq controller hardware - * and must be acked in order to be reenabled. After the ack another - * interrupt can happen on the same source even before the first one - * is handled by the associated event handler. If this happens it - * might be necessary to disable (mask) the interrupt depending on the - * controller hardware. This requires to reenable the interrupt inside - * of the loop which handles the interrupts which have arrived while - * the handler was running. If all pending interrupts are handled, the - * loop is left. + * Interrupt occurs on the falling and/or rising edge of a hardware + * signal. The occurrence is latched into the irq controller hardware and + * must be acked in order to be reenabled. After the ack another interrupt + * can happen on the same source even before the first one is handled by + * the associated event handler. If this happens it might be necessary to + * disable (mask) the interrupt depending on the controller hardware. This + * requires to reenable the interrupt inside of the loop which handles the + * interrupts which have arrived while the handler was running. If all + * pending interrupts are handled, the loop is left. 
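The handle_edge_irq() description above, and the reworked body that follows, hinge on one property: an edge that arrives while the handler is running is latched as PENDING and replayed before the loop is left, so no edge is lost even though the line was masked in the meantime. A toy model of that replay loop, with the device injecting extra edges mid-handler:

#include <stdbool.h>
#include <stdio.h>

static bool pending;
static int handled, injected;

static void device_raises_edge(void) { pending = true; injected++; }

static void handle_edge_model(void)
{
        pending = false;                 /* consume the edge that got us here */
        do {
                handled++;
                if (injected < 3)        /* device fires again mid-handler    */
                        device_raises_edge();
                if (!pending)
                        break;
                pending = false;         /* unmask + replay the latched edge  */
        } while (1);
}

int main(void)
{
        device_raises_edge();
        handle_edge_model();
        printf("injected=%d handled=%d\n", injected, handled);
        return 0;
}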
*/ void handle_edge_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + guard(raw_spinlock)(&desc->lock); - if (!irq_may_run(desc)) { + if (!irq_can_handle(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); - goto out_unlock; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. - */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; + return; } kstat_incr_irqs_this_cpu(desc); @@ -814,7 +838,7 @@ void handle_edge_irq(struct irq_desc *desc) do { if (unlikely(!desc->action)) { mask_irq(desc); - goto out_unlock; + return; } /* @@ -830,11 +854,7 @@ void handle_edge_irq(struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_unlock: - raw_spin_unlock(&desc->lock); + } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); } EXPORT_SYMBOL(handle_edge_irq); @@ -877,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc) void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); + unsigned int cpu = smp_processor_id(); + struct irqaction *action; irqreturn_t res; /* @@ -890,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); + for (action = desc->action; action; action = action->next) + if (cpumask_test_cpu(cpu, action->affinity)) + break; + if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { - unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) @@ -909,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * Similar to handle_fasteoi_nmi, but handling the dev_id cookie - * as a percpu pointer. 
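The handle_percpu_devid_irq() change above walks the action list and takes the first action whose affinity mask contains the executing CPU, instead of blindly using desc->action. A compilable model of that selection, with plain bitmaps standing in for cpumask_t:

#include <stdio.h>

struct action_model {
        unsigned long affinity;          /* bit n set => may run on CPU n */
        const char *name;
        struct action_model *next;
};

static struct action_model *pick_action(struct action_model *head, int cpu)
{
        for (struct action_model *a = head; a; a = a->next)
                if (a->affinity & (1UL << cpu))
                        return a;
        return NULL;
}

int main(void)
{
        struct action_model a1 = { .affinity = 0x1, .name = "cpu0-only" };
        struct action_model a0 = { .affinity = 0xe, .name = "cpus1-3", .next = &a1 };
        struct action_model *a = pick_action(&a0, 0);

        printf("cpu0 -> %s\n", a ? a->name : "none");
        return 0;
}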
- */ -void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) @@ -1007,35 +1006,23 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, } } -void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) +void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - - __irq_do_set_handler(desc, handle, is_chained, name); - irq_put_desc_busunlock(desc, flags); + scoped_irqdesc_get_and_buslock(irq, 0) + __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); } EXPORT_SYMBOL_GPL(__irq_set_handler); -void -irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, - void *data) +void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - desc->irq_common_data.handler_data = data; - __irq_do_set_handler(desc, handle, 1, NULL); - - irq_put_desc_busunlock(desc, flags); + desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); + } } EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); @@ -1050,38 +1037,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags, trigger, tmp; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return; - - /* - * Warn when a driver sets the no autoenable flag on an already - * active interrupt. - */ - WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - - irq_settings_clr_and_set(desc, clr, set); + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + unsigned long trigger, tmp; + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. 
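The WARN_ON_ONCE() retained in irq_modify_status() above fires when a driver sets _IRQ_NOAUTOEN on an interrupt that is already active; the intended order is to set the flag before the line is requested and to enable it explicitly later. The driver-side sketch below is editorial, not taken from this patch, and my_irq, my_dev_isr, struct my_dev and the probe function are placeholders.

/* Sketch only: IRQ_NOAUTOEN must be set before request_irq() activates the line. */
static int my_probe_sketch(struct my_dev *dev, unsigned int my_irq)
{
        int ret;

        irq_set_status_flags(my_irq, IRQ_NOAUTOEN);     /* before the irq is started */
        ret = request_irq(my_irq, my_dev_isr, 0, "my-dev", dev);
        if (ret)
                return ret;

        /* ... finish bringing the device up ... */

        enable_irq(my_irq);                             /* start taking interrupts */
        return 0;
}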
+ */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - trigger = irqd_get_trigger_type(&desc->irq_data); + irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); - if (irq_settings_has_no_balance_set(desc)) - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - if (irq_settings_is_per_cpu(desc)) - irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_is_level(desc)) - irqd_set(&desc->irq_data, IRQD_LEVEL); + trigger = irqd_get_trigger_type(&desc->irq_data); - tmp = irq_settings_get_trigger_mask(desc); - if (tmp != IRQ_TYPE_NONE) - trigger = tmp; + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, trigger); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; - irq_put_desc_unlock(desc, flags); + irqd_set(&desc->irq_data, trigger); + } } EXPORT_SYMBOL_GPL(irq_modify_status); @@ -1094,25 +1077,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status); */ void irq_cpu_online(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1124,25 +1103,21 @@ void irq_cpu_online(void) */ void irq_cpu_offline(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } #endif @@ -1151,102 +1126,69 @@ void irq_cpu_offline(void) #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** - * handle_fasteoi_ack_irq - irq handler for edge hierarchy - * stacked on transparent controllers + * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_ack() function called. 
*/ void handle_fasteoi_ack_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out; + guard(raw_spinlock)(&desc->lock); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + if (!irq_can_handle_pm(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; + } - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (unlikely(!irq_can_handle_actions(desc))) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); /** - * handle_fasteoi_mask_irq - irq handler for level hierarchy - * stacked on transparent controllers + * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_mask_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_mask_ack() function called. */ void handle_fasteoi_mask_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - mask_irq(desc); - goto out; + if (!irq_can_handle(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); - if (desc->istate & IRQS_ONESHOT) - mask_irq(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); @@ -1297,6 +1239,43 @@ int irq_chip_get_parent_state(struct irq_data *data, EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); /** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). 
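 *
 * Editor's illustration (not part of this patch): a child irq_chip stacked
 * on a transparent parent can forward these operations directly, e.g.
 *
 *	static struct irq_chip example_child_chip = {
 *		.name		= "example-child",
 *		.irq_startup	= irq_chip_startup_parent,
 *		.irq_shutdown	= irq_chip_shutdown_parent,
 *		.irq_mask	= irq_chip_mask_parent,
 *		.irq_unmask	= irq_chip_unmask_parent,
 *	};
 *
 * "example_child_chip" is a hypothetical name used only for illustration.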
+ */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + +/** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) * @data: Pointer to interrupt specific data diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 15a7654eff68..755346ea9819 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -177,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void) bool affinity_broken; desc = irq_to_desc(irq); - raw_spin_lock(&desc->lock); - affinity_broken = migrate_one_irq(desc); - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + affinity_broken = migrate_one_irq(desc); if (affinity_broken) { pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", @@ -211,15 +210,8 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup_managed(desc); /* * If the interrupt can only be directed to a single target @@ -244,9 +236,8 @@ int irq_affinity_online_cpu(unsigned int cpu) irq_lock_sparse(); for_each_active_irq(irq) { desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - irq_restore_affinity_of_irq(desc, cpu); - raw_spin_unlock_irq(&desc->lock); + scoped_guard(raw_spinlock_irq, &desc->lock) + irq_restore_affinity_of_irq(desc, cpu); } irq_unlock_sparse(); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index ca142b9a4db3..3527defd2890 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p) struct irq_desc *desc = m->private; struct irq_data *data; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); @@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p) seq_printf(m, "node: %d\n", irq_data_get_node(data)); irq_debug_show_masks(m, desc); irq_debug_show_data(m, data, 0); - raw_spin_unlock_irq(&desc->lock); return 0; } @@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev) void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { - char name [10]; + char name [12]; if (!irq_dir || !desc || desc->debugfs_file) return; - sprintf(name, "%d", irq); + sprintf(name, "%u", irq); desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc, &dfs_irq_ops); } diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index eb16a58e0322..b41188698622 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data) return this->irq == match->irq && this->dev_id == match->dev_id; } -/** - * devm_request_threaded_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a 
threaded interrupt context. NULL - * for devices which handle everything in @handler - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_threaded_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. - */ -int devm_request_threaded_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, irq_handler_t thread_fn, - unsigned long irqflags, const char *devname, - void *dev_id) +static int devm_request_result(struct device *dev, int rc, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + const char *devname) +{ + if (rc >= 0) + return rc; + + return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n", + irq, handler, thread_fn, devname ? : ""); +} + +static int __devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_threaded_irq); /** - * devm_request_any_context_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function + * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_any_context_irq(). IRQs requested with this function will be - * automatically freed on driver detach. + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_threaded_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. + * Return: 0 on success or a negative error number. 
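 *
 * Editor's illustration (not part of this patch): a probe routine can rely
 * on the logging described above and simply propagate the error, e.g.
 *
 *	ret = devm_request_threaded_irq(dev, irq, my_hardirq, my_thread_fn,
 *					IRQF_ONESHOT, NULL, drvdata);
 *	if (ret)
 *		return ret;
 *
 * No additional dev_err() is needed at the call site; "my_hardirq",
 * "my_thread_fn" and "drvdata" are placeholder names.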
*/ -int devm_request_any_context_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) +{ + int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn, + irqflags, devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, thread_fn, devname); +} +EXPORT_SYMBOL(devm_request_threaded_irq); + +static int __devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, return rc; } + +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_any_context_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. + * + * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error + * number. 
+ */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags, + devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, NULL, devname); +} EXPORT_SYMBOL(devm_request_any_context_irq); /** diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c4a8bca5f2b0..3cd0c40282c0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg); @@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache |= mask; irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache &= ~mask; irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg); @@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = ~d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } /** @@ -156,11 +150,10 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set); @@ -174,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.eoi); - irq_gc_unlock(gc); } /** @@ -196,12 +188,11 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) if (!(mask & gc->wake_enabled)) return -EINVAL; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); if (on) gc->wake_active |= mask; else gc->wake_active &= ~mask; - irq_gc_unlock(gc); return 0; } EXPORT_SYMBOL_GPL(irq_gc_set_wake); @@ -288,7 +279,6 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; - unsigned long flags; 
int numchips, i; size_t dgc_sz; size_t gc_sz; @@ -340,9 +330,8 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, goto err; } - raw_spin_lock_irqsave(&gc_lock, flags); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock_irqrestore(&gc_lock, flags); + scoped_guard (raw_spinlock_irqsave, &gc_lock) + list_add_tail(&gc->list, &gc_list); /* Calc pointer to the next generic chip */ tmp += gc_sz; } @@ -459,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, struct irq_chip_generic *gc; struct irq_chip_type *ct; struct irq_chip *chip; - unsigned long flags; int idx; gc = __irq_get_domain_generic_chip(d, hw_irq); @@ -479,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, /* We only init the cache for the first mapping of a generic chip */ if (!gc->installed) { - raw_spin_lock_irqsave(&gc->lock, flags); + guard(raw_spinlock_irqsave)(&gc->lock); irq_gc_init_mask_cache(gc, dgc->gc_flags); - raw_spin_unlock_irqrestore(&gc->lock, flags); } /* Mark the interrupt as installed */ @@ -548,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip *chip = &ct->chip; unsigned int i; - raw_spin_lock(&gc_lock); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_add_tail(&gc->list, &gc_list); irq_gc_init_mask_cache(gc, flags); @@ -616,9 +602,8 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, { unsigned int i, virq; - raw_spin_lock(&gc_lock); - list_del(&gc->list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_del(&gc->list); for (i = 0; msk; msk >>= 1, i++) { if (!(msk & 0x01)) @@ -665,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) } #ifdef CONFIG_PM -static int irq_gc_suspend(void) +static int irq_gc_suspend(void *data) { struct irq_chip_generic *gc; @@ -685,7 +670,7 @@ static int irq_gc_suspend(void) return 0; } -static void irq_gc_resume(void) +static void irq_gc_resume(void *data) { struct irq_chip_generic *gc; @@ -708,7 +693,7 @@ static void irq_gc_resume(void) #define irq_gc_resume NULL #endif -static void irq_gc_shutdown(void) +static void irq_gc_shutdown(void *data) { struct irq_chip_generic *gc; @@ -724,15 +709,19 @@ static void irq_gc_shutdown(void) } } -static struct syscore_ops irq_gc_syscore_ops = { +static const struct syscore_ops irq_gc_syscore_ops = { .suspend = irq_gc_suspend, .resume = irq_gc_resume, .shutdown = irq_gc_shutdown, }; +static struct syscore irq_gc_syscore = { + .ops = &irq_gc_syscore_ops, +}; + static int __init irq_gc_init_ops(void) { - register_syscore_ops(&irq_gc_syscore_ops); + register_syscore(&irq_gc_syscore); return 0; } device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9489f93b3db3..786f5570a640 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,53 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) */ atomic_inc(&desc->threads_active); - wake_up_process(action->thread); + /* + * This might be a premature wakeup before the thread reached the + * thread function and set the IRQTF_READY bit. It's waiting in + * kthread code with state UNINTERRUPTIBLE. Once it reaches the + * thread function it waits with INTERRUPTIBLE. The wakeup is not + * lost in that case because the thread is guaranteed to observe + * the RUN flag before it goes to sleep in wait_for_interrupt(). 
+ */ + wake_up_state(action->thread, TASK_INTERRUPTIBLE); +} + +static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); +static u64 irqhandler_duration_threshold_ns __ro_after_init; + +static int __init irqhandler_duration_check_setup(char *arg) +{ + unsigned long val; + int ret; + + ret = kstrtoul(arg, 0, &val); + if (ret) { + pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret); + return 0; + } + + if (!val) { + pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n"); + return 0; + } + + irqhandler_duration_threshold_ns = val * 1000; + static_branch_enable(&irqhandler_duration_check_enabled); + + return 1; +} +__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup); + +static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq, + const struct irqaction *action) +{ + u64 delta_ns = local_clock() - ts_start; + + if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) { + pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n", + smp_processor_id(), irq, action->handler, + div_u64(delta_ns, NSEC_PER_USEC)); + } } irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) @@ -155,7 +201,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) lockdep_hardirq_threaded(); trace_irq_handler_entry(irq, action); - res = action->handler(irq, action->dev_id); + + if (static_branch_unlikely(&irqhandler_duration_check_enabled)) { + u64 ts_start = local_clock(); + + res = action->handler(irq, action->dev_id); + irqhandler_duration_check(ts_start, irq, action); + } else { + res = action->handler(irq, action->dev_id); + } + trace_irq_handler_exit(irq, action, res); if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b0290849c395..0164ca48da59 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,6 +20,7 @@ #define istate core_internal_state__do_not_mess_with_it extern bool noirqdebug; +extern int irq_poll_cpu; extern struct irqaction chained_action; @@ -87,6 +88,7 @@ extern void __enable_irq(struct irq_desc *desc); extern int irq_activate(struct irq_desc *desc); extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); +extern void irq_startup_managed(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); @@ -111,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); int check_irq_resend(struct irq_desc *desc, bool inject); void clear_irq_resend(struct irq_desc *desc); void irq_resend_init(struct irq_desc *desc); -bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); void wake_threads_waitq(struct irq_desc *desc); @@ -141,6 +142,10 @@ extern int irq_setup_affinity(struct irq_desc *desc); static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif + +#define for_each_action_of_desc(desc, act) \ + for (act = desc->action; act; act = act->next) + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -160,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) -#define for_each_action_of_desc(desc, act) \ - for (act 
= desc->action; act; act = act->next) - -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check); +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); -static inline struct irq_desc * -irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, true, check); -} +__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true); +__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc, + __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus), + unsigned long flags; bool bus); -static inline void -irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus, + unsigned int check) { - __irq_put_desc_unlock(desc, flags, true); -} + class_irqdesc_lock_t _t = { .bus = bus, }; -static inline struct irq_desc * -irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, false, check); -} + _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check); -static inline void -irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) -{ - __irq_put_desc_unlock(desc, flags, false); + return _t; } +#define scoped_irqdesc_get_and_lock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, false, _check) + +#define scoped_irqdesc_get_and_buslock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, true, _check) + +#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope))) + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline unsigned int irqd_get(struct irq_data *d) @@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc) } #ifdef CONFIG_PM_SLEEP -bool irq_pm_check_wakeup(struct irq_desc *desc); +void irq_pm_handle_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else -static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } +static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c new file mode 100644 index 000000000000..e2d31914b3c4 --- /dev/null +++ b/kernel/irq/irq_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/irqdomain.h> +#include <linux/nodemask.h> +#include <kunit/test.h> + +#include "internals.h" + +static irqreturn_t noop_handler(int irq, void *data) +{ + return IRQ_HANDLED; +} + +static void noop(struct irq_data *data) { } +static unsigned int noop_ret(struct irq_data *data) { 
return 0; } + +static int noop_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + irq_data_update_effective_affinity(data, dest); + + return 0; +} + +static struct irq_chip fake_irq_chip = { + .name = "fake", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + .irq_set_affinity = noop_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd) +{ + struct irq_desc *desc; + int virq; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */ + irq_settings_clr_norequest(desc); + + return virq; +} + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_test_setup_fake_irq(test, NULL); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_free_disabled_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_test_setup_fake_irq(test, NULL); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + free_irq(virq, NULL); + KUNIT_EXPECT_GE(test, desc->depth, 1); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_shutdown_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + .mask = CPU_MASK_ALL, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); + + virq = irq_test_setup_fake_irq(test, &affinity); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + scoped_guard(raw_spinlock_irqsave, &desc->lock) + irq_shutdown_and_deactivate(desc); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + + KUNIT_EXPECT_EQ(test, irq_activate(desc), 0); +#ifdef CONFIG_SMP + irq_startup_managed(desc); +#endif + + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_cpuhotplug_test(struct kunit *test) +{ + struct irq_desc *desc; + 
struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for CPU hotplug"); + if (!get_cpu_device(1)) + kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); + if (!cpu_is_hotpluggable(1)) + kunit_skip(test, "CPU 1 must be hotpluggable"); + if (!cpu_online(1)) + kunit_skip(test, "CPU 1 must be online"); + + cpumask_copy(&affinity.mask, cpumask_of(1)); + + virq = irq_test_setup_fake_irq(test, &affinity); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); + KUNIT_EXPECT_GE(test, desc->depth, 1); + KUNIT_EXPECT_EQ(test, add_cpu(1), 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static struct kunit_case irq_test_cases[] = { + KUNIT_CASE(irq_disable_depth_test), + KUNIT_CASE(irq_free_disabled_test), + KUNIT_CASE(irq_shutdown_depth_test), + KUNIT_CASE(irq_cpuhotplug_test), + {} +}; + +static struct kunit_suite irq_test_suite = { + .name = "irq_test_cases", + .test_cases = irq_test_cases, +}; + +kunit_test_suite(irq_test_suite); +MODULE_DESCRIPTION("IRQ unit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4258cd6bd3b4..f8e4e13dbe33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -246,8 +246,7 @@ static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -static ssize_t per_cpu_count_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; @@ -257,112 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); -static ssize_t chip_name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - if (desc->irq_data.chip && desc->irq_data.chip->name) { - ret = scnprintf(buf, PAGE_SIZE, "%s\n", - desc->irq_data.chip->name); - } - raw_spin_unlock_irq(&desc->lock); - return ret; + guard(raw_spinlock_irq)(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) + return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); + return 0; } IRQ_ATTR_RO(chip_name); -static ssize_t 
hwirq_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) - ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); + return 0; } IRQ_ATTR_RO(hwirq); -static ssize_t type_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", - irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); - raw_spin_unlock_irq(&desc->lock); - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); -static ssize_t wakeup_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); - raw_spin_unlock_irq(&desc->lock); - - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); -static ssize_t name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->name) - ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%s\n", desc->name); + return 0; } IRQ_ATTR_RO(name); -static ssize_t actions_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; - raw_spin_lock_irq(&desc->lock); - for_each_action_of_desc(desc, action) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - p, action->name); - p = ","; + scoped_guard(raw_spinlock_irq, &desc->lock) { + for_each_action_of_desc(desc, action) { + ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); + p = ","; + } } - raw_spin_unlock_irq(&desc->lock); if (ret) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); - + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); @@ -418,19 +388,14 @@ static int __init irq_sysfs_init(void) int irq; /* Prevent concurrent irq alloc/free */ - irq_lock_sparse(); - + guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); - if (!irq_kobj_base) { - irq_unlock_sparse(); + if (!irq_kobj_base) return -ENOMEM; - } /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); - irq_unlock_sparse(); - return 0; } 
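Editor's note (not part of this patch): the sysfs conversions above all follow the same pattern, sketched here for illustration. The explicit raw_spin_lock_irq()/raw_spin_unlock_irq() pair around the show() callback body is replaced by a scope-based guard() helper, which releases desc->lock on every return path, and the open-coded sprintf()/scnprintf() into the sysfs buffer is replaced by sysfs_emit():

	/* Before: manual unlock on every exit path of a show() callback. */
	raw_spin_lock_irq(&desc->lock);
	ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
	raw_spin_unlock_irq(&desc->lock);
	return ret;

	/* After: the guard drops desc->lock when it goes out of scope. */
	guard(raw_spinlock_irq)(&desc->lock);
	return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq);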
postcore_initcall(irq_sysfs_init); @@ -573,12 +538,12 @@ err: return -ENOMEM; } -static int irq_expand_nr_irqs(unsigned int nr) +static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) - return -ENOMEM; + return false; nr_irqs = nr; - return 0; + return true; } int __init early_irq_init(void) @@ -656,11 +621,9 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } @@ -679,24 +642,16 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, return start; } -static int irq_expand_nr_irqs(unsigned int nr) +static inline bool irq_expand_nr_irqs(unsigned int nr) { - return -ENOMEM; + return false; } void irq_mark_irq(unsigned int irq) { - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); - mutex_unlock(&sparse_irq_lock); -} - -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); } -#endif #endif /* !CONFIG_SPARSE_IRQ */ @@ -765,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); * This function must be called from an IRQ context with irq regs * initialized. */ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } @@ -783,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq); * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned long flags; int ret; @@ -806,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); * This function must be called from an NMI context with irq regs * initialized. 
**/ -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); @@ -827,11 +782,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - - mutex_unlock(&sparse_irq_lock); } EXPORT_SYMBOL_GPL(irq_free_descs); @@ -848,11 +801,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * * Returns the first irq number or error code */ -int __ref -__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct irq_affinity_desc *affinity) +int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, + struct module *owner, const struct irq_affinity_desc *affinity) { - int start, ret; + int start; if (!cnt) return -EINVAL; @@ -870,22 +822,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from = arch_dynirq_lower_bound(from); } - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); - ret = -EEXIST; if (irq >=0 && start != irq) - goto unlock; + return -EEXIST; if (start + cnt > nr_irqs) { - ret = irq_expand_nr_irqs(start + cnt); - if (ret) - goto unlock; + if (!irq_expand_nr_irqs(start + cnt)) + return -ENOMEM; } - ret = alloc_descs(start, cnt, node, affinity, owner); -unlock: - mutex_unlock(&sparse_irq_lock); - return ret; + return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); @@ -900,27 +847,27 @@ unsigned int irq_get_next_irq(unsigned int offset) return irq_find_at_or_after(offset); } -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check) +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (desc) { - if (check & _IRQ_DESC_CHECK) { - if ((check & _IRQ_DESC_PERCPU) && - !irq_settings_is_per_cpu_devid(desc)) - return NULL; - - if (!(check & _IRQ_DESC_PERCPU) && - irq_settings_is_per_cpu_devid(desc)) - return NULL; - } + desc = irq_to_desc(irq); + if (!desc) + return NULL; - if (bus) - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, *flags); + if (check & _IRQ_DESC_CHECK) { + if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) + return NULL; + + if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) + return NULL; } + + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + return desc; } @@ -932,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity) +int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -945,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - desc->percpu_affinity = affinity ? 
: cpu_possible_mask; - irq_set_percpu_devid_flags(irq); return 0; } -int irq_set_percpu_devid(unsigned int irq) -{ - return irq_set_percpu_devid_partition(irq, NULL); -} - -int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !desc->percpu_enabled) - return -EINVAL; - - if (affinity) - cpumask_copy(affinity, desc->percpu_affinity); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9d5c8651492d..2652c4cfd877 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info domain->flags |= info->domain_flags; domain->exit = info->exit; + domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { @@ -480,33 +481,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(irq_domain_create_simple); -/** - * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: total number of irqs in legacy mapping - * @first_irq: first number of irq block assigned to the domain - * @first_hwirq: first hwirq number to use for the translation. Should normally - * be '0', but a positive integer can be used if the effective - * hwirqs numbering does not begin at zero. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - * - * Note: the map() callback will be called before this function returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). 
- */ -struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - irq_hw_number_t first_hwirq, - const struct irq_domain_ops *ops, - void *host_data) -{ - return irq_domain_create_legacy(of_node_to_fwnode(of_node), size, - first_irq, first_hwirq, ops, host_data); -} -EXPORT_SYMBOL_GPL(irq_domain_add_legacy); - struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, @@ -885,7 +859,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, { int i; - fwspec->fwnode = of_node_to_fwnode(np); + fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) @@ -893,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); -unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; - struct irq_data *irq_data; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); @@ -909,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) domain = irq_default_domain; } + return domain; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + struct irq_domain *domain = fwspec_to_domain(fwspec); + + memset(info, 0, sizeof(*info)); + + if (!domain || !domain->ops->get_fwspec_info) + return 0; + + return domain->ops->get_fwspec_info(fwspec, info); +} +#endif + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +{ + unsigned int type = IRQ_TYPE_NONE; + struct irq_domain *domain; + struct irq_data *irq_data; + irq_hw_number_t hwirq; + int virq; + + domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); @@ -1133,6 +1129,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** + * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings + * @d: Interrupt domain involved in the translation + * @ctrlr: The device tree node for the device whose interrupt is translated + * @intspec: The interrupt specifier data from the device tree + * @intsize: The number of entries in @intspec + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Device Tree interrupt specifier translation function for two or three + * cell bindings, where the cell values map directly to the hardware + * interrupt number and the type specifier. 
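 *
 * Editor's illustration (not part of this patch): with a two-cell
 * specifier such as <5 IRQ_TYPE_LEVEL_HIGH> the hwirq is taken from the
 * first cell (5) and the trigger type from the second; with a three-cell
 * specifier such as <0 5 IRQ_TYPE_LEVEL_HIGH> the first cell is not used
 * by the translation and the hwirq (5) and type come from the second and
 * third cells, as implemented by irq_domain_translate_twothreecell()
 * below.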
+ */ +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + + return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); + +/** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated @@ -1216,6 +1237,37 @@ int irq_domain_translate_twocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); +/** + * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell + * bindings + * @d: Interrupt domain involved in the translation + * @fwspec: The firmware interrupt specifier to translate + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Firmware interrupt specifier translation function for two or three cell + * specifications, where the parameter values map directly to the hardware + * interrupt number and the type specifier. + */ +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (fwspec->param_count == 2) { + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + if (fwspec->param_count == 3) { + *out_hwirq = fwspec->param[1]; + *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1252,47 +1304,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -/** - * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy - * @parent: Parent irq domain to associate with the new domain - * @flags: Irq domain flags associated to the domain - * @size: Size of the domain. See below - * @fwnode: Optional fwnode of the interrupt controller - * @ops: Pointer to the interrupt domain callbacks - * @host_data: Controller private data pointer - * - * If @size is 0 a tree domain is created, otherwise a linear domain. - * - * If successful the parent is associated to the new domain and the - * domain flags are set. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = fwnode, - .size = size, - .hwirq_max = size, - .ops = ops, - .host_data = host_data, - .domain_flags = flags, - .parent = parent, - }; - struct irq_domain *d; - - if (!info.size) - info.hwirq_max = ~0U; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? 
NULL : d; -} -EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); - static void irq_domain_insert_irq(int virq) { struct irq_data *data; @@ -1573,6 +1584,7 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, } irq_domain_free_irqs_common(domain, virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, @@ -2008,7 +2020,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -/** +/* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data @@ -2022,7 +2034,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); -/** +/* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 753eef8e041c..349ae7979da0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -43,8 +43,6 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) bool inprogress; do { - unsigned long flags; - /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. @@ -53,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* @@ -69,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - /* Oops, that failed? */ } while (inprogress); } /** - * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. If you use this - * function while holding a resource the IRQ handler may need you - * will deadlock. It does not take associated threaded handlers - * into account. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. It does not take + * associated threaded handlers into account. * - * Do not use this for shutdown scenarios where you must be sure - * that all parts (hardirq and threaded handler) have completed. + * Do not use this for shutdown scenarios where you must be sure that all + * parts (hardirq and threaded handler) have completed. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. * - * It does not check whether there is an interrupt in flight at the - * hardware level, but not serviced yet, as this might deadlock when - * called with interrupts disabled and the target CPU of the interrupt - * is the current CPU. 
+ * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when called + * with interrupts disabled and the target CPU of the interrupt is the + * current CPU. */ bool synchronize_hardirq(unsigned int irq) { @@ -121,19 +116,19 @@ static void __synchronize_irq(struct irq_desc *desc) } /** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. * - * It optionally makes sure (when the irq chip supports that method) - * that the interrupt is not pending in any CPU and waiting for - * service. + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. */ void synchronize_irq(unsigned int irq) { @@ -156,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc) } /** - * irq_can_set_affinity - Check if the affinity of a given irq can be set - * @irq: Interrupt to check + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) @@ -181,13 +176,13 @@ bool irq_can_set_affinity_usr(unsigned int irq) } /** - * irq_set_thread_affinity - Notify irq threads to adjust affinity - * @desc: irq descriptor which has affinity changed + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affinity changed * - * We just set IRQTF_AFFINITY and delegate the affinity setting - * to the interrupt thread itself. We can not call - * set_cpus_allowed_ptr() here as we hold desc->lock and this - * code can be called from hard interrupt context. + * Just set IRQTF_AFFINITY and delegate the affinity setting to the + * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as + * we hold desc->lock and this code can be called from hard interrupt + * context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { @@ -400,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ -int irq_update_affinity_desc(unsigned int irq, - struct irq_affinity_desc *affinity) +int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { - struct irq_desc *desc; - unsigned long flags; - bool activated; - int ret = 0; - /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. 
@@ -415,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq, if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + bool activated; - /* Requires the interrupt to be shut down */ - if (irqd_is_started(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } - - /* Interrupts which are already managed cannot be modified */ - if (irqd_affinity_is_managed(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } - - /* - * Deactivate the interrupt. That's required to undo - * anything an earlier activation has established. - */ - activated = irqd_is_activated(&desc->irq_data); - if (activated) - irq_domain_deactivate_irq(&desc->irq_data); + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) + return -EBUSY; - if (affinity->is_managed) { - irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); - irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); - } + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) + return -EBUSY; + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. + */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); - cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } - /* Restore the activation state */ - if (activated) - irq_domain_activate_irq(&desc->irq_data, false); + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + return 0; + } + return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; if (!desc) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + guard(raw_spinlock_irqsave)(&desc->lock); + return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** @@ -501,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, - bool setaffinity) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - desc->affinity_hint = m; - irq_put_desc_unlock(desc, flags); - if (m && setaffinity) + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->affinity_hint = m; + ret = 0; + } + + if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); - return 0; + return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); + struct irq_affinity_notify *notify = 
container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; - unsigned long flags; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; - raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(&desc->irq_data)) - irq_get_pending(cpumask, desc); - else - cpumask_copy(cpumask, desc->irq_common_data.affinity); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + if (irq_move_pending(&desc->irq_data)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_common_data.affinity); + } notify->notify(notify, cpumask); @@ -543,22 +519,20 @@ out: } /** - * irq_set_affinity_notifier - control notification of IRQ affinity changes - * @irq: Interrupt for which to enable/disable notification - * @notify: Context for notification, or %NULL to disable - * notification. Function pointers must be initialised; - * the other fields will be initialised by this function. - * - * Must be called in process context. Notification may only be enabled - * after the IRQ is allocated and must be disabled before the IRQ is - * freed using free_irq(). + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is freed + * using free_irq(). */ -int -irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; - unsigned long flags; /* The release function is promised process context */ might_sleep(); @@ -573,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) INIT_WORK(&notify->work, irq_affinity_notify); } - raw_spin_lock_irqsave(&desc->lock, flags); - old_notify = desc->affinity_notify; - desc->affinity_notify = notify; - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irq, &desc->lock) { + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { @@ -597,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; - int ret, node = irq_desc_get_node(desc); + int node = irq_desc_get_node(desc); + static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; @@ -605,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (!__irq_can_set_affinity(desc)) return 0; - raw_spin_lock(&mask_lock); + guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online.
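The irq_affinity_notify()/irq_set_affinity_notifier() hunks above only reshape the locking; the driver-facing contract described in the kernel-doc is unchanged. As a reminder of how that API is consumed, here is a small driver-side sketch, not part of the patch, with hypothetical demo_* names:

#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/kref.h>
#include <linux/printk.h>

struct demo_dev {
	struct irq_affinity_notify affinity_notify;
	unsigned int irq;
};

static void demo_affinity_changed(struct irq_affinity_notify *notify,
				  const cpumask_t *mask)
{
	struct demo_dev *dd = container_of(notify, struct demo_dev, affinity_notify);

	/* React to the new affinity, e.g. re-steer queues. */
	pr_info("demo: irq %u affinity now %*pbl\n", dd->irq, cpumask_pr_args(mask));
}

static void demo_affinity_release(struct kref *ref)
{
	/* Nothing dynamically allocated in this sketch. */
}

static int demo_register_notifier(struct demo_dev *dd)
{
	dd->affinity_notify.notify = demo_affinity_changed;
	dd->affinity_notify.release = demo_affinity_release;

	/*
	 * Process context only; tear down with a NULL notifier before
	 * free_irq(), as the kernel-doc above requires.
	 */
	return irq_set_affinity_notifier(dd->irq, &dd->affinity_notify);
}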
@@ -630,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } - ret = irq_do_set_affinity(&desc->irq_data, &mask, false); - raw_spin_unlock(&mask_lock); - return ret; + return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ @@ -645,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc) /** - * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt - * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU - * specific data for percpu_devid interrupts - * - * This function uses the vCPU specific data to set the vCPU - * affinity for an irq. The vCPU specific data is passed from - * outside, such as KVM. One example code path is as below: - * KVM -> IOMMU -> irq_set_vcpu_affinity(). + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts + * + * This function uses the vCPU specific data to set the vCPU affinity for + * an irq. The vCPU specific data is passed from outside, such as KVM. One + * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). */ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - struct irq_data *data; - struct irq_chip *chip; - int ret = -ENOSYS; + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + struct irq_data *data; + struct irq_chip *chip; - if (!desc) - return -EINVAL; - - data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + data = irq_desc_get_irq_data(desc); + do { + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + break; - if (data) - ret = chip->irq_set_vcpu_affinity(data, vcpu_info); - irq_put_desc_unlock(desc, flags); + data = irqd_get_parent_data(data); + } while (data); - return ret; + if (!data) + return -ENOSYS; + return chip->irq_set_vcpu_affinity(data, vcpu_info); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); @@ -694,26 +659,23 @@ void __disable_irq(struct irq_desc *desc) static int __disable_irq_nosync(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - __disable_irq(desc); - irq_put_desc_busunlock(desc, flags); - return 0; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + __disable_irq(scoped_irqdesc); + return 0; + } + return -EINVAL; } /** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. 
* - * This function may be called from IRQ context. + * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { @@ -722,17 +684,17 @@ void disable_irq_nosync(unsigned int irq) EXPORT_SYMBOL(disable_irq_nosync); /** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * Disable the selected interrupt line. Enables and Disables are nested. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. + * + * Can only be called from preemptible code as it might sleep when an + * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) @@ -744,40 +706,39 @@ void disable_irq(unsigned int irq) EXPORT_SYMBOL(disable_irq); /** - * disable_hardirq - disables an irq and waits for hardirq completion - * @irq: Interrupt to disable + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are nested. * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. If you use this function while - * holding a resource the hard IRQ handler may need you will deadlock. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the hard IRQ handler may need you will deadlock. * - * When used to optimistically disable an interrupt from atomic context - * the return value must be checked. + * When used to optimistically disable an interrupt from atomic context the + * return value must be checked. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); - return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** - * disable_nmi_nosync - disable an nmi without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and enables are - * nested. - * The interrupt to disable must have been requested through request_nmi. - * Unlike disable_nmi(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and enables are nested. + * + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. 
*/ void disable_nmi_nosync(unsigned int irq) { @@ -817,41 +778,34 @@ void __enable_irq(struct irq_desc *desc) } /** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * Undoes the effect of one call to disable_irq(). If this matches the + * last disable, processing of interrupts on this IRQ line is re-enabled. * - * This function may be called from IRQ context only when - * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * This function may be called from IRQ context only when + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - if (WARN(!desc->irq_data.chip, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - goto out; - - __enable_irq(desc); -out: - irq_put_desc_busunlock(desc, flags); + if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + __enable_irq(desc); + } } EXPORT_SYMBOL(enable_irq); /** - * enable_nmi - enable handling of an nmi - * @irq: Interrupt to enable + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable * - * The interrupt to enable must have been requested through request_nmi. - * Undoes the effect of one call to disable_nmi(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * The interrupt to enable must have been requested through request_nmi. + * Undoes the effect of one call to disable_nmi(). If this matches the last + * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { @@ -873,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * irq_set_irq_wake - control irq power management wakeup - * @irq: interrupt to control - * @on: enable/disable power management wakeup - * - * Enable/disable power management wakeup mode, which is - * disabled by default. Enables and disables must match, - * just as they match for non-wakeup mode support. - * - * Wakeup mode lets this IRQ wake the system from sleep - * states like "suspend to RAM". - * - * Note: irq enable/disable state is completely orthogonal - * to the enable/disable state of irq wake. An irq can be - * disabled with disable_irq() and still wake the system as - * long as the irq has wake enabled. If this does not hold, - * then the underlying irq chip and the related driver need - * to be investigated. + * irq_set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode, which is disabled by + * default. Enables and disables must match, just as they match for + * non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep states like + * "suspend to RAM". + * + * Note: irq enable/disable state is completely orthogonal to the + * enable/disable state of irq wake. An irq can be disabled with + * disable_irq() and still wake the system as long as the irq has wake + * enabled. 
If this does not hold, then the underlying irq chip and the + * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; + int ret = 0; - /* Don't use NMIs as wake up interrupts please */ - if (irq_is_nmi(desc)) { - ret = -EINVAL; - goto out_unlock; - } + /* Don't use NMIs as wake up interrupts please */ + if (irq_is_nmi(desc)) + return -EINVAL; - /* wakeup-capable irqs can be shared between drivers that - * don't need to have the same sleep mode behaviors. - */ - if (on) { - if (desc->wake_depth++ == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 0; - else - irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); - } - } else { - if (desc->wake_depth == 0) { - WARN(1, "Unbalanced IRQ %d wake disable\n", irq); - } else if (--desc->wake_depth == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 1; - else - irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + /* + * wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ + if (on) { + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); + } + } else { + if (desc->wake_depth == 0) { + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + } } + return ret; } - -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -940,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake); * particular irq has been exclusively allocated or is available * for driver use. 
*/ -int can_request_irq(unsigned int irq, unsigned long irqflags) +bool can_request_irq(unsigned int irq, unsigned long irqflags) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - int canrequest = 0; - - if (!desc) - return 0; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (irq_settings_can_request(desc)) { - if (!desc->action || - irqflags & desc->action->flags & IRQF_SHARED) - canrequest = 1; + if (irq_settings_can_request(desc)) { + if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) + return true; + } } - irq_put_desc_unlock(desc, flags); - return canrequest; + return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) @@ -1016,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - - desc->parent_irq = parent_irq; - - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->parent_irq = parent_irq; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif @@ -1063,7 +1001,6 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; - bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -1079,22 +1016,14 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a return; } - raw_spin_lock_irq(&desc->lock); - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
- */ - if (cpumask_available(desc->irq_common_data.affinity)) { + scoped_guard(raw_spinlock_irq, &desc->lock) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); - valid = true; } - raw_spin_unlock_irq(&desc->lock); - if (valid) - set_cpus_allowed_ptr(current, mask); + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -1259,9 +1188,8 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) if (WARN_ON_ONCE(!secondary)) return; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); - raw_spin_unlock_irq(&desc->lock); } /* @@ -1302,7 +1230,10 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); - sched_set_fifo(current); + if (action->handler == irq_forced_secondary_handler) + sched_set_fifo_secondary(current); + else + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) @@ -1334,21 +1265,19 @@ static int irq_thread(void *data) } /** - * irq_wake_thread - wake the irq thread for the action identified by dev_id - * @irq: Interrupt line - * @dev_id: Device identity for which the thread should be woken - * + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) @@ -1356,7 +1285,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id) break; } } - raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(irq_wake_thread); @@ -1471,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * references an already freed task_struct. */ new->thread = get_task_struct(t); + + /* + * The affinity can not be established yet, but it will be once the + * interrupt is enabled. Delay and defer the actual setting to the + * thread itself once it is ready to run. In the meantime, prevent + * it from ever being re-affined directly by cpuset or + * housekeeping. The proper way to do it is to re-affine the whole + * vector. + */ + kthread_bind_mask(t, cpu_possible_mask); + /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. + * Ensure the thread adjusts the affinity once it reaches the + * thread function. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); + return 0; } +static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new) +{ + do { + if (cpumask_intersects(old->affinity, new->affinity) || + old->percpu_dev_id == new->percpu_dev_id) + return false; + + old = old->next; + } while (old); + + return true; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
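The thread setup changes above (kthread_bind_mask(), the IRQTF_AFFINITY handling and the FIFO priority split for secondary handlers) all serve the threaded-interrupt model that request_threaded_irq(), documented further below, exposes to drivers. For orientation, a minimal sketch of that split-handler pattern; the demo_* device and its register layout are invented for illustration:

#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/bits.h>

#define DEMO_REG_STATUS		0x00	/* hypothetical register layout */
#define DEMO_REG_MASK		0x04
#define DEMO_IRQ_PENDING	BIT(0)

struct demo_dev {
	void __iomem *regs;
	int irq;
};

/* Hard handler: check the source is ours, mask it, defer the real work. */
static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	struct demo_dev *dd = dev_id;

	if (!(readl(dd->regs + DEMO_REG_STATUS) & DEMO_IRQ_PENDING))
		return IRQ_NONE;		/* shared line, not our interrupt */

	writel(DEMO_IRQ_PENDING, dd->regs + DEMO_REG_MASK);
	return IRQ_WAKE_THREAD;
}

/* Threaded handler: runs in the kthread created by setup_irq_thread(), may sleep. */
static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	struct demo_dev *dd = dev_id;

	/* ... drain the device ... */
	writel(0, dd->regs + DEMO_REG_MASK);	/* unmask again */
	return IRQ_HANDLED;
}

static int demo_setup_irq(struct demo_dev *dd)
{
	return request_threaded_irq(dd->irq, demo_hardirq, demo_thread_fn,
				    IRQF_SHARED, "demo", dd);
}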
@@ -1504,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; + bool per_cpu_devid; if (!desc) return -EINVAL; @@ -1513,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + per_cpu_devid = irq_settings_is_per_cpu_devid(desc); + new->irq = irq; /* @@ -1620,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ unsigned int oldtype; - if (irq_is_nmi(desc)) { + if (irq_is_nmi(desc) && !per_cpu_devid) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } + if (per_cpu_devid && !valid_percpu_irqaction(old, new)) { + pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. @@ -1777,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - } else { + } else if (!per_cpu_devid) { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request @@ -1987,9 +1945,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * There is no interrupt on the fly anymore. Deactivate it * completely. */ - raw_spin_lock_irqsave(&desc->lock, flags); - irq_domain_deactivate_irq(&desc->irq_data); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); @@ -2005,20 +1962,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } /** - * free_irq - free an interrupt allocated with request_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. + * Remove an interrupt handler. The handler is removed and if the interrupt + * line is no longer in use by any driver it is disabled. On a shared IRQ + * the caller must ensure the interrupt is disabled on the card it drives + * before calling this function. The function does not return until any + * executing interrupts for this IRQ have completed. * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. * - * Returns the devname argument passed to request_irq. + * Returns the devname argument passed to request_irq. 
*/ const void *free_irq(unsigned int irq, void *dev_id) { @@ -2075,8 +2031,6 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - const void *devname; if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; @@ -2088,53 +2042,46 @@ const void *free_nmi(unsigned int irq, void *dev_id) if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); - devname = __cleanup_nmi(irq, desc); - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return devname; + return __cleanup_nmi(irq, desc); } /** - * request_threaded_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Primary handler for threaded interrupts. - * If handler is NULL and thread_fn != NULL - * the default primary handler is installed. - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler is - * still called in hard interrupt context and has to check - * whether the interrupt originates from the device. If yes it - * needs to disable the interrupt on the device and return - * IRQ_WAKE_THREAD which will wake up the handler thread and run - * @thread_fn. This split handler design is necessary to support - * shared interrupts. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: + * request_threaded_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts. + * If handler is NULL and thread_fn != NULL + * the default primary handler is installed. + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. From the point this call is made your handler function + * may be invoked. Since your handler function must clear any interrupt the + * board raises, you must take care both to initialise your hardware and to + * set up the interrupt handler in the right order. + * + * If you want to set up a threaded irq handler for your device then you + * need to supply @handler and @thread_fn. 
@handler is still called in hard + * interrupt context and has to check whether the interrupt originates from + * the device. If yes it needs to disable the interrupt on the device and + * return IRQ_WAKE_THREAD which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support shared + * interrupts. + * + * @dev_id must be globally unique. Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id as this is + * required when freeing the interrupt. + * + * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level @@ -2232,21 +2179,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL(request_threaded_irq); /** - * request_any_context_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @flags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It selects either a - * hardirq or threaded handling method depending on the - * context. - * - * On failure, it returns a negative value. On success, - * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It selects either a hardirq or threaded handling + * method depending on the context. + * + * Returns: On failure, it returns a negative value. On success, it returns either + * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) @@ -2273,37 +2219,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(request_any_context_irq); /** - * request_nmi - allocate an interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @irqflags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It sets up the IRQ line - * to be handled as an NMI. - * - * An interrupt line delivering NMIs cannot be shared and IRQ handling - * cannot be threaded. - * - * Interrupt lines requested for NMI delivering must produce per cpu - * interrupts and have auto enabling setting disabled. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail and return a negative value. 
+ * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It sets up the IRQ line to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. + * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * @dev_id must be globally unique. Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function will fail + * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (irq == IRQ_NOTCONNECTED) @@ -2345,21 +2289,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - - /* Setup NMI state */ - desc->istate |= IRQS_NMI; - retval = irq_nmi_setup(desc); - if (retval) { - __cleanup_nmi(irq, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return -EINVAL; + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + return -EINVAL; + } + return 0; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return 0; - err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: @@ -2370,35 +2310,25 @@ err_out: void enable_percpu_irq(unsigned int irq, unsigned int type) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - /* - * If the trigger type is not specified by the caller, then - * use the default for this interrupt. - */ - type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - type = irqd_get_trigger_type(&desc->irq_data); - - if (type != IRQ_TYPE_NONE) { - int ret; - - ret = __irq_set_trigger(desc, type); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + struct irq_desc *desc = scoped_irqdesc; - if (ret) { - WARN(1, "failed to set type for IRQ%d\n", irq); - goto out; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. 
+ */ + type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + + if (type != IRQ_TYPE_NONE) { + if (__irq_set_trigger(desc, type)) { + WARN(1, "failed to set type for IRQ%d\n", irq); + return; + } } + irq_percpu_enable(desc, smp_processor_id()); } - - irq_percpu_enable(desc, cpu); -out: - irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(enable_percpu_irq); @@ -2416,33 +2346,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type) */ bool irq_percpu_is_enabled(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - struct irq_desc *desc; - unsigned long flags; - bool is_enabled; - - desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return false; - - is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); - irq_put_desc_unlock(desc, flags); - - return is_enabled; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); + return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - irq_percpu_disable(desc, cpu); - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); @@ -2457,72 +2370,60 @@ void disable_percpu_nmi(unsigned int irq) static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned long flags; + struct irqaction *action, **action_ptr; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; - raw_spin_lock_irqsave(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + action_ptr = &desc->action; + for (;;) { + action = *action_ptr; - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - goto bad; - } + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); - goto bad; - } + if (action->percpu_dev_id == dev_id) + break; - /* Found it - now remove it from the list of entries: */ - desc->action = NULL; + action_ptr = &action->next; + } - desc->istate &= ~IRQS_NMI; + if (cpumask_intersects(desc->percpu_enabled, action->affinity)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, + cpumask_first_and(desc->percpu_enabled, action->affinity)); + return NULL; + } - raw_spin_unlock_irqrestore(&desc->lock, flags); + /* Found it - now remove it from the list of entries: */ + *action_ptr = action->next; - unregister_handler_proc(irq, action); + /* Demote from NMI if we killed the last action */ + if (!desc->action) + desc->istate &= ~IRQS_NMI; + } + unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; - -bad: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return NULL; } /** - * remove_percpu_irq - free a per-cpu interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt + * free_percpu_irq - free an interrupt allocated with request_percpu_irq + * 
@irq: Interrupt line to free + * @dev_id: Device identity to free * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && irq_settings_is_per_cpu_devid(desc)) - __free_percpu_irq(irq, act->percpu_dev_id); -} - -/** - * free_percpu_irq - free an interrupt allocated with request_percpu_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * Remove a percpu interrupt handler. The handler is removed, but the + * interrupt line is not disabled. This must be done on each CPU before + * calling this function. The function does not return until any executing + * interrupts for this IRQ have completed. * - * Remove a percpu interrupt handler. The handler is removed, but - * the interrupt line is not disabled. This must be done on each - * CPU before calling this function. The function does not return - * until any executing interrupts for this IRQ have completed. - * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { @@ -2551,9 +2452,9 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) } /** - * setup_percpu_irq - setup a per-cpu interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt + * setup_percpu_irq - setup a per-cpu interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ @@ -2569,6 +2470,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; + if (!act->affinity) + act->affinity = cpu_online_mask; + retval = __setup_irq(irq, desc, act); if (retval) @@ -2577,26 +2481,57 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) return retval; } +static +struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, + const char *devname, const cpumask_t *affinity, + void __percpu *dev_id) +{ + struct irqaction *action; + + if (!affinity) + affinity = cpu_possible_mask; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return NULL; + + action->handler = handler; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; + action->name = devname; + action->percpu_dev_id = dev_id; + action->affinity = affinity; + + /* + * We allow some form of sharing for non-overlapping affinity + * masks. Obviously, covering all CPUs prevents any sharing in + * the first place. + */ + if (!cpumask_equal(affinity, cpu_possible_mask)) + action->flags |= IRQF_SHARED; + + return action; +} + /** - * __request_percpu_irq - allocate a percpu interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @flags: Interrupt type flags (IRQF_TIMER only) - * @devname: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt on the local CPU. If the interrupt is supposed to be - * enabled on other CPUs, it has to be done on each CPU using - * enable_percpu_irq(). - * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. 
+ * __request_percpu_irq - allocate a percpu interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) + * @devname: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources, but doesn't enable the interrupt + * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN. + * It has to be done on each CPU using enable_percpu_irq(). + * + * @dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *dev_id) + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2613,15 +2548,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, if (flags && flags != IRQF_TIMER) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; - action->name = devname; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); @@ -2640,32 +2570,32 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** - * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @name: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function - * - * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs - * have to be setup on each CPU by calling prepare_percpu_nmi() before - * being enabled on the same CPU by using enable_percpu_nmi(). - * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. - * - * Interrupt lines requested for NMI delivering should have auto enabling - * setting disabled. - * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @name: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). + * + * @dev_id must be globally unique. It is a per-cpu variable, and the + * handler gets called with the interrupted CPU's instance of that + * variable. + * + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. 
*/ -int request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *name, void __percpu *dev_id) +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (!handler) @@ -2679,20 +2609,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, !irq_supports_nmi(desc)) return -EINVAL; - /* The line cannot already be NMI */ - if (irq_is_nmi(desc)) + /* The line cannot be NMI already if the new request covers all CPUs */ + if (irq_is_nmi(desc) && + (!affinity || cpumask_equal(affinity, cpu_possible_mask))) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, + name, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD - | IRQF_NOBALANCING; - action->name = name; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; @@ -2701,10 +2627,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->istate |= IRQS_NMI; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc->istate |= IRQS_NMI; return 0; err_irq_setup: @@ -2716,79 +2640,55 @@ err_out: } /** - * prepare_percpu_nmi - performs CPU local setup for NMI delivery - * @irq: Interrupt line to prepare for NMI delivery + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery * - * This call prepares an interrupt line to deliver NMI on the current CPU, - * before that interrupt line gets enabled with enable_percpu_nmi(). + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). * - * As a CPU local operation, this should be called from non-preemptible - * context. + * As a CPU local operation, this should be called from non-preemptible + * context. * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * If the interrupt line cannot be used to deliver NMIs, function will fail + * returning a negative value. 
*/ int prepare_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - int ret = 0; + int ret = -EINVAL; WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return -EINVAL; - - if (WARN(!irq_is_nmi(desc), - KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", - irq)) { - ret = -EINVAL; - goto out; - } + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN(!irq_is_nmi(scoped_irqdesc), + "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) + return -EINVAL; - ret = irq_nmi_setup(desc); - if (ret) { - pr_err("Failed to setup NMI delivery: irq %u\n", irq); - goto out; + ret = irq_nmi_setup(scoped_irqdesc); + if (ret) + pr_err("Failed to setup NMI delivery: irq %u\n", irq); } - -out: - irq_put_desc_unlock(desc, flags); return ret; } /** - * teardown_percpu_nmi - undoes NMI setup of IRQ line - * @irq: Interrupt line from which CPU local NMI configuration should be - * removed + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be removed * - * This call undoes the setup done by prepare_percpu_nmi(). + * This call undoes the setup done by prepare_percpu_nmi(). * - * IRQ line should not be enabled for the current CPU. - * - * As a CPU local operation, this should be called from non-preemptible - * context. + * IRQ line should not be enabled for the current CPU. + * As a CPU local operation, this should be called from non-preemptible + * context. */ void teardown_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return; - - if (WARN_ON(!irq_is_nmi(desc))) - goto out; - - irq_nmi_teardown(desc); -out: - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) + return; + irq_nmi_teardown(scoped_irqdesc); + } } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) @@ -2815,87 +2715,62 @@ static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state } /** - * irq_get_irqchip_state - returns the irqchip state of a interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: One of IRQCHIP_STATE_* the caller wants to know about - * @state: a pointer to a boolean where the state is to be stored + * irq_get_irqchip_state - returns the irqchip state of a interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: One of IRQCHIP_STATE_* the caller wants to know about + * @state: a pointer to a boolean where the state is to be stored * - * This call snapshots the internal irqchip state of an - * interrupt, returning into @state the bit corresponding to - * stage @which + * This call snapshots the internal irqchip state of an interrupt, + * returning into @state the bit corresponding to stage @which * - * This function should be called with preemption disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with preemption disabled if the interrupt + * controller has per-cpu registers. 
*/ -int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool *state) +int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { - struct irq_desc *desc; - struct irq_data *data; - unsigned long flags; - int err = -EINVAL; - - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); - data = irq_desc_get_irq_data(desc); - - err = __irq_get_irqchip_state(data, which, state); - - irq_put_desc_busunlock(desc, flags); - return err; + return __irq_get_irqchip_state(data, which, state); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** - * irq_set_irqchip_state - set the state of a forwarded interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: State to be restored (one of IRQCHIP_STATE_*) - * @val: Value corresponding to @which + * irq_set_irqchip_state - set the state of a forwarded interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which * - * This call sets the internal irqchip state of an interrupt, - * depending on the value of @which. + * This call sets the internal irqchip state of an interrupt, depending on + * the value of @which. * - * This function should be called with migration disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with migration disabled if the interrupt + * controller has per-cpu registers. */ -int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool val) +int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { - struct irq_desc *desc; - struct irq_data *data; - struct irq_chip *chip; - unsigned long flags; - int err = -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); + struct irq_chip *chip; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + do { + chip = irq_data_get_irq_chip(data); - data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; - do { - chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) { - err = -ENODEV; - goto out_unlock; - } - if (chip->irq_set_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + if (chip->irq_set_irqchip_state) + break; - if (data) - err = chip->irq_set_irqchip_state(data, which, val); + data = irqd_get_parent_data(data); + } while (data); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return err; + if (data) + return chip->irq_set_irqchip_state(data, which, val); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c05ba7ca00fa..68886881fe10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -59,7 +59,8 @@ struct msi_ctrl { static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl); static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid); static inline int msi_sysfs_create_group(struct device *dev); - +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg); /** * msi_alloc_desc - Allocate an initialized msi_desc @@ -343,26 +344,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI 
descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -448,7 +453,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -462,7 +466,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -471,16 +475,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. 
*/ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -706,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq = ops->get_hwirq(info, arg); int i, ret; - if (irq_find_mapping(domain, hwirq) > 0) + if (irq_resolve_mapping(domain, hwirq)) return -EEXIST; if (domain->parent) { @@ -796,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, return 0; } +static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) +{ +} + static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { @@ -821,6 +825,7 @@ static struct msi_domain_ops msi_domain_ops_default = { .get_hwirq = msi_domain_ops_get_hwirq, .msi_init = msi_domain_ops_init, .msi_prepare = msi_domain_ops_prepare, + .msi_teardown = msi_domain_ops_teardown, .set_desc = msi_domain_ops_set_desc, }; @@ -842,6 +847,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->msi_init = msi_domain_ops_default.msi_init; if (ops->msi_prepare == NULL) ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->msi_teardown == NULL) + ops->msi_teardown = msi_domain_ops_default.msi_teardown; if (ops->set_desc == NULL) ops->set_desc = msi_domain_ops_default.set_desc; } @@ -882,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, if (domain) { irq_domain_update_bus_token(domain, info->bus_token); + domain->dev = info->dev; if (info->flags & MSI_FLAG_PARENT_PM_DEV) domain->pm_dev = parent->pm_dev; } @@ -905,6 +913,32 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, } /** + * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain + * @info: MSI irqdomain creation info + * @msi_parent_ops: MSI parent callbacks and configuration + * + * Return: pointer to the created &struct irq_domain or %NULL on failure + */ +struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info, + const struct msi_parent_ops *msi_parent_ops) +{ + struct irq_domain *d; + + info->hwirq_max = max(info->hwirq_max, info->size); + info->size = info->hwirq_max; + info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + info->bus_token = msi_parent_ops->bus_select_token; + + d = irq_domain_instantiate(info); + if (IS_ERR(d)) + return NULL; + + d->msi_parent_ops = msi_parent_ops; + return d; +} +EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain); + +/** * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down * in the domain hierarchy * @dev: The device for which the domain should be created @@ -998,9 +1032,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -1008,7 +1041,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + kmemdup(template, sizeof(*bundle), GFP_KERNEL); if 
(!bundle) return false; @@ -1017,6 +1051,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, bundle->info.ops = &bundle->ops; bundle->info.data = domain_data; bundle->info.chip_data = chip_data; + bundle->info.alloc_data = &bundle->alloc_info; + bundle->info.dev = dev; pops = parent->msi_parent_ops; snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", @@ -1031,41 +1067,42 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; - domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); - return true; -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) { + dev->msi.data->__domains[domid].domain = NULL; + irq_domain_remove(domain); + return false; + } + + /* @bundle and @fwnode_alloced are now in use. 
Prevent cleanup */ + retain_and_null_ptr(bundle); + retain_and_null_ptr(fwnode_alloced); + return true; } /** @@ -1079,23 +1116,21 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; + + info->ops->msi_teardown(domain, info->alloc_data); + if (irq_domain_is_msi_device(domain)) fwnode = domain->fwnode; irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1111,16 +1146,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1238,6 +1271,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag return 0; } +static int populate_alloc_info(struct irq_domain *domain, struct device *dev, + unsigned int nirqs, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * If the caller has provided a template alloc info, use that. Once + * all users of msi_create_irq_domain() have been eliminated, this + * should be the only source of allocation information, and the + * prepare call below should be finally removed. 
+ */ + if (!info->alloc_data) + return msi_domain_prepare_irqs(domain, dev, nirqs, arg); + + *arg = *info->alloc_data; + return 0; +} + static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain, struct msi_ctrl *ctrl) { @@ -1250,7 +1301,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain unsigned long idx; int i, ret, virq; - ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg); + ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg); if (ret) return ret; @@ -1391,12 +1442,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - int ret; - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; + guard(msi_descs_lock)(dev); + return msi_domain_alloc_irqs_range_locked(dev, domid, first, last); } EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); @@ -1500,12 +1548,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1542,13 +1586,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? 
map.virq : map.index; } @@ -1602,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } @@ -1641,9 +1680,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1673,9 +1711,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1694,12 +1731,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c556bc49d213..99ff65466d87 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,17 +13,13 @@ #include "internals.h" -bool irq_pm_check_wakeup(struct irq_desc *desc) +void irq_pm_handle_wakeup(struct irq_desc *desc) { - if (irqd_is_wakeup_armed(&desc->irq_data)) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); - desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; - desc->depth++; - irq_disable(desc); - pm_system_irq_wakeup(irq_desc_get_irq(desc)); - return true; - } - return false; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); } /* @@ -46,8 +42,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - (desc->no_suspend_depth + - desc->cond_suspend_depth) != desc->nr_actions); + (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -134,14 +129,12 @@ void suspend_device_irqs(void) int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + sync = suspend_device_irq(desc); if (sync) synchronize_irq(irq); @@ -186,18 +179,15 @@ static void resume_irqs(bool want_early) int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; - bool is_early = desc->action && - desc->action->flags & IRQF_EARLY_RESUME; + bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); + 
guard(raw_spinlock_irqsave)(&desc->lock); resume_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -207,41 +197,40 @@ static void resume_irqs(bool want_early) */ void rearm_wake_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - - if (!(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) - goto unlock; + if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) + return; - desc->istate &= ~IRQS_SUSPENDED; - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); - __enable_irq(desc); - -unlock: - irq_put_desc_busunlock(desc, flags); + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + } } /** * irq_pm_syscore_resume - enable interrupt lines early + * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -static void irq_pm_syscore_resume(void) +static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } -static struct syscore_ops irq_pm_syscore_ops = { +static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; +static struct syscore irq_pm_syscore = { + .ops = &irq_pm_syscore_ops, +}; + static int __init irq_pm_init_ops(void) { - register_syscore_ops(&irq_pm_syscore_ops); + register_syscore(&irq_pm_syscore); return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8e29809de38d..77258eafbf63 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -48,6 +48,8 @@ static int show_irq_affinity(int type, struct seq_file *m) struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; + guard(raw_spinlock_irq)(&desc->lock); + switch (type) { case AFFINITY: case AFFINITY_LIST: @@ -81,20 +83,18 @@ static int show_irq_affinity(int type, struct seq_file *m) static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - unsigned long flags; cpumask_var_t mask; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->affinity_hint) - cpumask_copy(mask, desc->affinity_hint); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + } seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); - return 0; } @@ -295,32 +295,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) #define MAX_NAMELEN 128 -static int name_unique(unsigned int irq, struct irqaction *new_action) +static bool name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; - int ret = 1; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } + !strcmp(new_action->name, action->name)) + return false; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + return true; } void register_handler_proc(unsigned int irq, struct irqaction *action) { - char name [MAX_NAMELEN]; + char name[MAX_NAMELEN]; struct irq_desc *desc = irq_to_desc(irq); - if (!desc->dir || action->dir || 
!action->name || - !name_unique(irq, action)) + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; snprintf(name, MAX_NAMELEN, "%s", action->name); @@ -347,17 +341,16 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) * added, not when the descriptor is created, so multiple * tasks might try to register at the same time. */ - mutex_lock(®ister_lock); + guard(mutex)(®ister_lock); if (desc->dir) - goto out_unlock; - - sprintf(name, "%d", irq); + return; /* create /proc/irq/1234 */ + sprintf(name, "%u", irq); desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - goto out_unlock; + return; #ifdef CONFIG_SMP umode_t umode = S_IRUGO; @@ -366,31 +359,27 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) umode |= S_IWUSR; /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", umode, desc->dir, - &irq_affinity_proc_ops, irqp); + proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, - irq_affinity_hint_proc_show, irqp); + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); - proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, - irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK proc_create_single_data("effective_affinity", 0444, desc->dir, - irq_effective_aff_proc_show, irqp); + irq_effective_aff_proc_show, irqp); proc_create_single_data("effective_affinity_list", 0444, desc->dir, - irq_effective_aff_list_proc_show, irqp); + irq_effective_aff_list_proc_show, irqp); # endif #endif proc_create_single_data("spurious", 0444, desc->dir, - irq_spurious_proc_show, (void *)(long)irq); + irq_spurious_proc_show, (void *)(long)irq); -out_unlock: - mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -468,7 +457,6 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction *action; struct irq_desc *desc; - unsigned long flags; if (i > ACTUAL_NR_IRQS) return 0; @@ -487,13 +475,13 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - rcu_read_lock(); + guard(rcu)(); desc = irq_to_desc(i); if (!desc || irq_settings_is_hidden(desc)) - goto outsparse; + return 0; if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) - goto outsparse; + return 0; seq_printf(p, "%*d:", prec, i); for_each_online_cpu(j) { @@ -503,7 +491,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, ' '); - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -532,9 +520,6 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); - raw_spin_unlock_irqrestore(&desc->lock, flags); -outsparse: - rcu_read_unlock(); return 0; } #endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1b7fa72968bd..ca9cc1b806a9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -30,18 +30,17 @@ static DEFINE_RAW_SPINLOCK(irq_resend_lock); */ static void resend_irqs(struct tasklet_struct *unused) { - struct irq_desc *desc; - - raw_spin_lock_irq(&irq_resend_lock); + guard(raw_spinlock_irq)(&irq_resend_lock); while 
(!hlist_empty(&irq_resend_list)) { - desc = hlist_entry(irq_resend_list.first, struct irq_desc, - resend_node); + struct irq_desc *desc; + + desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } - raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ @@ -75,19 +74,18 @@ static int irq_sw_resend(struct irq_desc *desc) } /* Add to resend_list and activate the softirq: */ - raw_spin_lock(&irq_resend_lock); - if (hlist_unhashed(&desc->resend_node)) - hlist_add_head(&desc->resend_node, &irq_resend_list); - raw_spin_unlock(&irq_resend_lock); + scoped_guard(raw_spinlock, &irq_resend_lock) { + if (hlist_unhashed(&desc->resend_node)) + hlist_add_head(&desc->resend_node, &irq_resend_list); + } tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { - raw_spin_lock(&irq_resend_lock); + guard(raw_spinlock)(&irq_resend_lock); hlist_del_init(&desc->resend_node); - raw_spin_unlock(&irq_resend_lock); } void irq_resend_init(struct irq_desc *desc) @@ -172,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject) */ int irq_inject_interrupt(unsigned int irq) { - struct irq_desc *desc; - unsigned long flags; - int err; + int err = -EINVAL; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - /* - * Only try to inject when the interrupt is: - * - not NMI type - * - activated - */ - if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data)) - err = -EINVAL; - else - err = check_irq_resend(desc, true); - - irq_put_desc_busunlock(desc, flags); + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data)) + err = check_irq_resend(desc, true); + } return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 02b2daf07441..73280ccb74b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -19,77 +19,41 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(struct timer_list *unused); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); -static int irq_poll_cpu; +int irq_poll_cpu; static atomic_t irq_poll_active; /* - * We wait here for a poller to finish. - * - * If the poll runs on this CPU, then we yell loudly and return - * false. That will leave the interrupt line disabled in the worst - * case, but it should never happen. - * - * We wait until the poller is done and then recheck disabled and - * action (about to be disabled). Only if it's still active, we return - * true and let the handler run. 
- */ -bool irq_wait_for_poll(struct irq_desc *desc) - __must_hold(&desc->lock) -{ - if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), - "irq poll in progress on cpu %d for irq %d\n", - smp_processor_id(), desc->irq_data.irq)) - return false; - -#ifdef CONFIG_SMP - do { - raw_spin_unlock(&desc->lock); - while (irqd_irq_inprogress(&desc->irq_data)) - cpu_relax(); - raw_spin_lock(&desc->lock); - } while (irqd_irq_inprogress(&desc->irq_data)); - /* Might have been disabled in meantime */ - return !irqd_irq_disabled(&desc->irq_data) && desc->action; -#else - return false; -#endif -} - - -/* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(struct irq_desc *desc, bool force) +static bool try_one_irq(struct irq_desc *desc, bool force) { - irqreturn_t ret = IRQ_NONE; struct irqaction *action; + bool ret = false; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * PER_CPU, nested thread interrupts and interrupts explicitly * marked polled are excluded from polling. */ - if (irq_settings_is_per_cpu(desc) || - irq_settings_is_nested_thread(desc) || + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) || irq_settings_is_polled(desc)) - goto out; + return false; /* * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitly. */ if (irqd_irq_disabled(&desc->irq_data) && !force) - goto out; + return false; /* * All handlers must agree on IRQF_SHARED, so we test just the * first. */ action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER)) - goto out; + if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER)) + return false; /* Already running on another processor */ if (irqd_irq_inprogress(&desc->irq_data)) { @@ -98,21 +62,19 @@ static int try_one_irq(struct irq_desc *desc, bool force) * CPU to go looking for our mystery interrupt too */ desc->istate |= IRQS_PENDING; - goto out; + return false; } /* Mark it poll in progress */ desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) - ret = IRQ_HANDLED; + ret = true; /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; -out: - raw_spin_unlock(&desc->lock); - return ret == IRQ_HANDLED; + return ret; } static int misrouted_irq(int irq) @@ -157,8 +119,7 @@ static void poll_spurious_irqs(struct timer_list *unused) continue; /* Racy but it doesn't matter */ - state = desc->istate; - barrier(); + state = READ_ONCE(desc->istate); if (!(state & IRQS_SPURIOUS_DISABLED)) continue; @@ -168,8 +129,7 @@ static void poll_spurious_irqs(struct timer_list *unused) } out: atomic_dec(&irq_poll_active); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } static inline int bad_action_ret(irqreturn_t action_ret) @@ -193,17 +153,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; - unsigned long flags; - if (bad_action_ret(action_ret)) { - printk(KERN_ERR "irq event %d: bogus return value %x\n", - irq, action_ret); - } else { - printk(KERN_ERR "irq %d: nobody cared (try booting with " - "the \"irqpoll\" option)\n", irq); - } + if (bad_action_ret(action_ret)) + pr_err("irq event %d: bogus return value %x\n", irq, action_ret); + else + pr_err("irq %d: nobody cared (try booting 
with the \"irqpoll\" option)\n", irq); dump_stack(); - printk(KERN_ERR "handlers:\n"); + pr_err("handlers:\n"); /* * We need to take desc->lock here. note_interrupt() is called @@ -211,15 +167,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * with something else removing an action. It's ok to take * desc->lock here. See synchronize_irq(). */ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); + pr_err("[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %ps", - action->thread_fn, action->thread_fn); - printk(KERN_CONT "\n"); + pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn); + pr_cont("\n"); } - raw_spin_unlock_irqrestore(&desc->lock, flags); } static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) @@ -232,18 +186,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int -try_misrouted_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; if (!irqfixup) - return 0; + return false; /* We didn't actually handle the IRQ - see if it was misrouted? */ if (action_ret == IRQ_NONE) - return 1; + return true; /* * But for 'irqfixup == 2' we also do it for handled interrupts if @@ -251,19 +204,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, * traditional PC timer interrupt.. Legacy) */ if (irqfixup < 2) - return 0; + return false; if (!irq) - return 1; + return true; /* * Since we don't get the descriptor lock, "action" can - * change under us. We don't really care, but we don't - * want to follow a NULL pointer. So tell the compiler to - * just load it once by using a barrier. + * change under us. 
*/ - action = desc->action; - barrier(); + action = READ_ONCE(desc->action); return action && (action->flags & IRQF_IRQPOLL); } @@ -273,8 +223,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq; - if (desc->istate & IRQS_POLL_INPROGRESS || - irq_settings_is_polled(desc)) + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; if (bad_action_ret(action_ret)) { @@ -420,13 +369,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) /* * Now kill the IRQ */ - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + pr_emerg("Disabling IRQ #%d\n", irq); desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } @@ -436,11 +384,9 @@ bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { noirqdebug = 1; - printk(KERN_INFO "IRQ lockup detection disabled\n"); - + pr_info("IRQ lockup detection disabled\n"); return 1; } - __setup("noirqdebug", noirqdebug_setup); module_param(noirqdebug, bool, 0644); MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); @@ -452,12 +398,10 @@ static int __init irqfixup_setup(char *str) return 1; } irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); - + pr_warn("Misrouted IRQ fixup support enabled.\n"); + pr_warn("This may impact system performance.\n"); return 1; } - __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); @@ -468,11 +412,8 @@ static int __init irqpoll_setup(char *str) return 1; } irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); - printk(KERN_WARNING "This may significantly impact system " - "performance\n"); + pr_warn("Misrouted IRQ fixup and polling support enabled\n"); + pr_warn("This may significantly impact system performance\n"); return 1; } - __setup("irqpoll", irqpoll_setup); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 4198f30aac3c..049e296f586c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -103,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, - * and return the first char from this token. + * and return the first char from this token. If MSB of length + * is 1, it is a "big" symbol, so needs an additional byte. 
*/ + if (kallsyms_names[off] & 0x80) + off++; return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } @@ -829,8 +832,7 @@ static struct bpf_iter_reg ksym_iter_reg_info = { .seq_info = &ksym_iter_seq_info, }; -BTF_ID_LIST(btf_ksym_iter_id) -BTF_ID(struct, kallsym_iter) +BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter) static int __init bpf_ksym_iter_register(void) { diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. */ static void kcov_fault_in_area(struct kcov *kcov) @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. + */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 6ce73cceaf53..219d22857c98 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -125,7 +125,7 @@ static void probe_console(void *ignore, const char *buf, size_t len) goto out; /* No second line of interest. */ - strcpy(observed.lines[nlines++], "<none>"); + strscpy(observed.lines[nlines++], "<none>"); } } @@ -231,7 +231,7 @@ static bool __report_matches(const struct expect_report *r) if (!r->access[1].fn) { /* Dummy string if no second access is available. */ - strcpy(cur, "<none>"); + strscpy(expect[2], "<none>"); break; } } @@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test) struct kcsan_scoped_access *reorder_access = NULL; #endif arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - atomic_t dummy; + atomic_t dummy = ATOMIC_INIT(0); KCSAN_TEST_REQUIRES(test, reorder_access != NULL); KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); @@ -1383,7 +1383,7 @@ static void test_atomic_builtins_missing_barrier(struct kunit *test) * The thread counts are chosen to cover potentially interesting boundaries and * corner cases (2 to 5), and then stress the system with larger counts. 
*/ -static const void *nthreads_gen_params(const void *prev, char *desc) +static const void *nthreads_gen_params(struct kunit *test, const void *prev, char *desc) { long nthreads = (long)prev; @@ -1501,7 +1501,7 @@ static int access_thread(void *arg) } } while (!torture_must_stop()); timer_delete_sync(&timer); - destroy_timer_on_stack(&timer); + timer_destroy_on_stack(&timer); torture_kthread_stopping("access_thread"); return 0; diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3e62b944c883..95c585c6ddc3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -15,6 +15,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> @@ -40,6 +41,8 @@ #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> +#include <linux/dma-map-ops.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> @@ -232,7 +235,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ @@ -553,6 +555,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +611,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. 
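The hunks that follow convert kimage_load_normal_segment()/kimage_load_crash_segment() to take a segment index and add a fast path that copies the payload straight into a preallocated contiguous (CMA) block, so no page-by-page relocation is needed at kexec time. A minimal sketch of the page-sized copy-and-zero loop that the CMA path relies on — illustrative only; the function name is made up, and the real code also handles user-space source buffers, propagates copy errors, and calls cond_resched() between chunks:

/*
 * Sketch, not the patch itself: copy ubytes of payload into a
 * physically contiguous destination and zero-fill the rest of the
 * memsz window, one PAGE_SIZE chunk at a time.
 */
static void sketch_copy_segment_contig(char *dst, const char *src,
                                       size_t ubytes, size_t mbytes)
{
        while (mbytes) {
                size_t mchunk = min_t(size_t, mbytes, PAGE_SIZE);
                size_t uchunk = min(ubytes, mchunk);

                memcpy(dst, src, uchunk);
                if (uchunk < mchunk)
                        memset(dst + uchunk, 0, mchunk - uchunk);

                src += uchunk;
                ubytes -= uchunk;
                dst += mchunk;
                mbytes -= mchunk;
        }
}
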
@@ -716,9 +739,64 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + mchunk = min_t(size_t, mbytes, PAGE_SIZE); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +811,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -755,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -787,13 +866,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -820,9 +899,7 @@ static int kimage_load_crash_segment(struct kimage *image, } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ @@ -858,18 +935,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } @@ -877,6 +953,68 @@ int kimage_load_segment(struct kimage *image, return result; } +void *kimage_map_segment(struct kimage *image, int idx) +{ + unsigned long addr, size, eaddr; + unsigned long src_page_addr, dest_page_addr = 0; + kimage_entry_t *ptr, entry; + struct page **src_pages; + unsigned int npages; + struct page *cma; + void *vaddr = NULL; + int i; + + cma = image->segment_cma[idx]; + if (cma) + return page_address(cma); + + addr = image->segment[idx].mem; + size = image->segment[idx].memsz; + eaddr = addr + size; + /* + * Collect the source pages and map them in a contiguous VA range. + */ + npages = PFN_UP(eaddr) - PFN_DOWN(addr); + src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL); + if (!src_pages) { + pr_err("Could not allocate ima pages array.\n"); + return NULL; + } + + i = 0; + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + dest_page_addr = entry & PAGE_MASK; + } else if (entry & IND_SOURCE) { + if (dest_page_addr >= addr && dest_page_addr < eaddr) { + src_page_addr = entry & PAGE_MASK; + src_pages[i++] = + virt_to_page(__va(src_page_addr)); + if (i == npages) + break; + dest_page_addr += PAGE_SIZE; + } + } + } + + /* Sanity check. */ + WARN_ON(i < npages); + + vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); + kfree(src_pages); + + if (!vaddr) + pr_err("Could not map ima buffer.\n"); + + return vaddr; +} + +void kimage_unmap_segment(void *segment_buffer) +{ + if (is_vmalloc_addr(segment_buffer)) + vunmap(segment_buffer); +} + struct kexec_load_limit { /* Mutex protects the limit count. 
*/ struct mutex mutex; @@ -1009,6 +1147,10 @@ int kernel_kexec(void) goto Unlock; } + error = liveupdate_reboot(); + if (error) + goto Unlock; + #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* @@ -1026,7 +1168,7 @@ int kernel_kexec(void) console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) - goto Resume_console; + goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows @@ -1081,7 +1223,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - Resume_console: console_resume_all(); thaw_processes(); Restore_console: @@ -1093,3 +1234,143 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +static ssize_t loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!kexec_image); +} +static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); + +#ifdef CONFIG_CRASH_DUMP +static ssize_t crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); +} +static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); + +#ifdef CONFIG_CRASH_RESERVE +static ssize_t crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); +#endif + +static ssize_t crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sysfs_emit(buf, "%zd\n", size); +} +static ssize_t crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (kstrtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? 
ret : count; +} +static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); + +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); + +#endif /* CONFIG_CRASH_HOTPLUG */ +#endif /* CONFIG_CRASH_DUMP */ + +static struct attribute *kexec_attrs[] = { + &loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP + &crash_loaded_attr.attr, + &crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &crash_cma_ranges_attr.attr, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif +#endif + NULL +}; + +struct kexec_link_entry { + const char *target; + const char *name; +}; + +static struct kexec_link_entry kexec_links[] = { + { "loaded", "kexec_loaded" }, +#ifdef CONFIG_CRASH_DUMP + { "crash_loaded", "kexec_crash_loaded" }, + { "crash_size", "kexec_crash_size" }, +#ifdef CONFIG_CRASH_RESERVE + {"crash_cma_ranges", "kexec_crash_cma_ranges"}, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, +#endif +#endif +}; + +static struct kobject *kexec_kobj; +ATTRIBUTE_GROUPS(kexec); + +static int __init init_kexec_sysctl(void) +{ + int error; + int i; + + kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); + if (!kexec_kobj) { + pr_err("failed to create kexec kobject\n"); + return -ENOMEM; + } + + error = sysfs_create_groups(kexec_kobj, kexec_groups); + if (error) + goto kset_exit; + + for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { + error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, + kexec_links[i].target, + kexec_links[i].name); + if (error) + pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); + } + + return 0; + +kset_exit: + kobject_put(kexec_kobj); + return error; +} + +subsys_initcall(init_kexec_sysctl); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index fba686487e3b..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,7 +19,6 @@ #include <linux/list.h> #include <linux/fs.h> #include <linux/ima.h> -#include <crypto/hash.h> #include <crypto/sha2.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -27,6 +26,7 @@ #include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> +#include <linux/dma-map-ops.h> #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -38,6 +38,21 @@ void set_kexec_sig_enforced(void) } #endif +#ifdef CONFIG_IMA_KEXEC +static bool check_ima_segment_index(struct kimage *image, int i) +{ + if (image->is_ima_segment_index_set && i == image->ima_segment_index) + return true; + else + return false; +} +#else +static bool check_ima_segment_index(struct kimage *image, int i) +{ + return false; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* Maximum size in bytes for kernel/initrd files. */ @@ -186,6 +201,15 @@ kimage_validate_signature(struct kimage *image) } #endif +static int kexec_post_load(struct kimage *image, unsigned long flags) +{ +#ifdef CONFIG_IMA_KEXEC + if (!(flags & KEXEC_FILE_ON_CRASH)) + ima_kexec_post_load(image); +#endif + return machine_kexec_post_load(image); +} + /* * In file mode list of segments is prepared by kernel. 
Copy relevant * data from user space, do error checking, prepare segment list @@ -230,6 +254,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -253,6 +280,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); + /* If KHO is active, add its images to the list */ + ret = kho_fill_kimage(image); + if (ret) + goto out; + /* Call image load handler */ ldata = kexec_image_load_default(image); @@ -406,14 +438,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } kimage_terminate(image); - ret = machine_kexec_post_load(image); + ret = kexec_post_load(image, flags); if (ret) goto out; @@ -445,6 +477,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, temp_end = min(end, kbuf->buf_max); temp_start = temp_end - kbuf->memsz + 1; + kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start); do { /* align down start */ @@ -489,6 +522,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, temp_start = max(start, kbuf->buf_min); + kexec_random_range_start(temp_start, end, kbuf, &temp_start); + do { temp_start = ALIGN(temp_start, kbuf->buf_align); temp_end = temp_start + kbuf->memsz - 1; @@ -632,6 +667,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -648,6 +720,21 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; + /* + * If KHO is active, only use KHO scratch memory. All other memory + * could potentially be handed over. + */ + ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback); + if (ret <= 0) + return ret; + + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. 
+ */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -693,6 +780,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. */ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -705,6 +793,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } @@ -712,11 +801,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct crypto_shash *tfm; - struct shash_desc *desc; + struct sha256_ctx sctx; int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; + size_t nullsz; + u8 digest[SHA256_DIGEST_SIZE]; void *zero_buf; struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; @@ -727,37 +815,12 @@ static int kexec_calculate_store_digests(struct kimage *image) zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) { - ret = -ENOMEM; - goto out_free_desc; - } - - desc->tfm = tfm; + if (!sha_regions) + return -ENOMEM; - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; - - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } + sha256_init(&sctx); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -776,10 +839,14 @@ static int kexec_calculate_store_digests(struct kimage *image) if (ksegment->kbuf == pi->purgatory_buf) continue; - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; + /* + * Skip the segment if ima_segment_index is set and matches + * the current index + */ + if (check_ima_segment_index(image, i)) + continue; + + sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -791,44 +858,26 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; + sha256_update(&sctx, zero_buf, bytes); nullsz -= bytes; } - if (ret) - break; - sha_regions[j].start = ksegment->mem; sha_regions[j].len = ksegment->memsz; j++; } - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; + sha256_final(&sctx, digest); - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } + ret = kexec_purgatory_get_set_symbol(image, 
"purgatory_sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_sha_regions; -out_free_digest: - kfree(digest); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); out_free_sha_regions: vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index d35d9792402d..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); @@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ + +struct kexec_buf; + +#ifdef CONFIG_KEXEC_HANDOVER +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); +int kho_fill_kimage(struct kimage *image); +#else +static inline int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +static inline int kho_fill_kimage(struct kimage *image) { return 0; } +#endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ffe0c3d52306..ab8f9fc1f0d1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -135,8 +135,12 @@ struct kprobe_insn_cache kprobe_insn_slots = { static int collect_garbage_slots(struct kprobe_insn_cache *c); /** - * __get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. + * __get_insn_slot - Find a slot on an executable page for an instruction. + * @c: Pointer to kprobe instruction cache + * + * Description: Locates available slot on existing executable pages, + * allocates an executable page if there's no room on existing ones. + * Return: Pointer to instruction slot on success, NULL on failure. */ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { diff --git a/kernel/stackleak.c b/kernel/kstack_erase.c index bb65321761b4..d4449884084c 100644 --- a/kernel/stackleak.c +++ b/kernel/kstack_erase.c @@ -6,14 +6,14 @@ * * Author: Alexander Popov <alex.popov@linux.com> * - * STACKLEAK reduces the information which kernel stack leak bugs can + * KSTACK_ERASE reduces the information which kernel stack leak bugs can * reveal and blocks some uninitialized stack variable attacks. 
*/ -#include <linux/stackleak.h> +#include <linux/kstack_erase.h> #include <linux/kprobes.h> -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE #include <linux/jump_label.h> #include <linux/string_choices.h> #include <linux/sysctl.h> @@ -23,7 +23,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); #ifdef CONFIG_SYSCTL static int stack_erasing_sysctl(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -68,7 +68,7 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else #define skip_erasing() false -#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ +#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */ #ifndef __stackleak_poison static __always_inline void __stackleak_poison(unsigned long erase_low, @@ -91,7 +91,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack) erase_low = stackleak_find_top_of_poison(task_stack_low, current->lowest_stack); -#ifdef CONFIG_STACKLEAK_METRICS +#ifdef CONFIG_KSTACK_ERASE_METRICS current->prev_lowest_stack = erase_low; #endif @@ -113,7 +113,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack) else erase_high = task_stack_high; - __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON); + __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON); /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = task_stack_high; @@ -156,16 +156,16 @@ asmlinkage void noinstr stackleak_erase_off_task_stack(void) __stackleak_erase(false); } -void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void) { unsigned long sp = current_stack_pointer; /* - * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than - * STACKLEAK_SEARCH_DEPTH makes the poison search in + * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than + * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in * stackleak_erase() unreliable. Let's prevent that. 
*/ - BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH); /* 'lowest_stack' should be aligned on the register width boundary */ sp = ALIGN(sp, sizeof(unsigned long)); @@ -174,4 +174,4 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) current->lowest_stack = sp; } } -EXPORT_SYMBOL(stackleak_track_stack); +EXPORT_SYMBOL(__sanitizer_cov_stack_depth); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index eefb67d9883c..a9e6354d9e25 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> -#include <linux/kexec.h> +#include <linux/vmcore_info.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> @@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC_CORE -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -#ifdef CONFIG_CRASH_DUMP -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - ssize_t size = crash_get_memory_size(); - - if (size < 0) - return size; - - return sysfs_emit(buf, "%zd\n", size); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (kstrtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -#endif /* CONFIG_CRASH_DUMP*/ -#endif /* CONFIG_KEXEC_CORE */ - #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, @@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#ifdef CONFIG_CRASH_HOTPLUG -static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned int sz = crash_get_elfcorehdr_size(); - - return sysfs_emit(buf, "%u\n", sz); -} -KERNEL_ATTR_RO(crash_elfcorehdr_size); - -#endif - #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ @@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC_CORE - &kexec_loaded_attr.attr, -#ifdef CONFIG_CRASH_DUMP - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, -#endif -#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, -#ifdef CONFIG_CRASH_HOTPLUG - &crash_elfcorehdr_size_attr.attr, -#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 77c44924cf54..39511dd2abc9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. 
For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { @@ -594,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -858,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -883,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: @@ -894,6 +888,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences @@ -1207,7 +1202,8 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); */ void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); + struct kthread_delayed_work *dwork = timer_container_of(dwork, t, + timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; @@ -1603,6 +1599,7 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + WARN_ON_ONCE(!mm->user_ns); /* * It is possible for mm to be the same as tsk->active_mm, but diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 53d51ed619a3..4c0a9c18d0b2 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -18,3 +18,15 @@ config LIVEPATCH module uses the interface provided by this option to register a patch, causing calls to patched functions to be redirected to new function code contained in the patch module. 
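As a reminder of the livepatch flow this option describes, the following is a minimal sketch in the spirit of the in-tree livepatch sample; the patched symbol "orig_func", its replacement and the module boilerplate are illustrative and not taken from this series:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/livepatch.h>

/* Hypothetical replacement for a vmlinux function called "orig_func". */
static int livepatch_orig_func(void)
{
        return 0;
}

static struct klp_func funcs[] = {
        {
                .old_name = "orig_func",
                .new_func = livepatch_orig_func,
        }, { }
};

static struct klp_object objs[] = {
        {
                /* NULL .name means the symbol lives in vmlinux. */
                .funcs = funcs,
        }, { }
};

static struct klp_patch patch = {
        .mod  = THIS_MODULE,
        .objs = objs,
};

static int patch_init(void)
{
        /* Register the patch; calls to orig_func() get redirected. */
        return klp_enable_patch(&patch);
}

static void patch_exit(void)
{
}

module_init(patch_init);
module_exit(patch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");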
+ +config HAVE_KLP_BUILD + bool + help + Arch supports klp-build + +config KLP_BUILD + def_bool y + depends on LIVEPATCH && HAVE_KLP_BUILD + select OBJTOOL + help + Enable klp-build support diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0e73fac55f8e..9917756dae46 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -88,8 +88,14 @@ static struct klp_func *klp_find_func(struct klp_object *obj, struct klp_func *func; klp_for_each_func(obj, func) { + /* + * Besides identical old_sympos, also consider old_sympos + * of 0 and 1 are identical. + */ if ((strcmp(old_func->old_name, func->old_name) == 0) && - (old_func->old_sympos == func->old_sympos)) { + ((old_func->old_sympos == func->old_sympos) || + (old_func->old_sympos == 0 && func->old_sympos == 1) || + (old_func->old_sympos == 1 && func->old_sympos == 0))) { return func; } } @@ -217,14 +223,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol\n", - strtab + sym->st_name); + pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n", + strtab + sym->st_name, symndx, i); return -EINVAL; } /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%511[^,],%lu", + KLP_SYM_PREFIX "%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -303,7 +309,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * See comment in klp_resolve_symbols() for an explanation * of the selected field width value. */ - cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]", sec_objname); if (cnt != 1) { pr_err("section %s has an incorrectly formatted name\n", diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index ba069459c101..2351a19ac2a9 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt; /* * When a livepatch is in progress, enable klp stack checking in - * cond_resched(). This helps CPU-bound kthreads get patched. + * schedule(). This helps CPU-bound kthreads get patched. 
*/ -#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - -#define klp_cond_resched_enable() sched_dynamic_klp_enable() -#define klp_cond_resched_disable() sched_dynamic_klp_disable() - -#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -EXPORT_SYMBOL(klp_sched_try_switch_key); -#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) -#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) - -#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ +#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key) /* * This work can be performed periodically to finish patching or unpatching any @@ -365,26 +356,18 @@ static bool klp_try_switch_task(struct task_struct *task) void __klp_sched_try_switch(void) { - if (likely(!klp_patch_pending(current))) - return; - /* - * This function is called from cond_resched() which is called in many - * places throughout the kernel. Using the klp_mutex here might - * deadlock. - * - * Instead, disable preemption to prevent racing with other callers of - * klp_try_switch_task(). Thanks to task_call_func() they won't be - * able to switch this task while it's running. + * This function is called from __schedule() while a context switch is + * about to happen. Preemption is already disabled and klp_mutex + * can't be acquired. + * Disabled preemption is used to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch to this task while it's running. */ - preempt_disable(); + lockdep_assert_preemption_disabled(); - /* - * Make sure current didn't get patched between the above check and - * preempt_disable(). - */ - if (unlikely(!klp_patch_pending(current))) - goto out; + if (likely(!klp_patch_pending(current))) + return; /* * Enforce the order of the TIF_PATCH_PENDING read above and the @@ -395,11 +378,7 @@ void __klp_sched_try_switch(void) smp_rmb(); klp_try_switch_task(current); - -out: - preempt_enable(); } -EXPORT_SYMBOL(__klp_sched_try_switch); /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. @@ -508,7 +487,7 @@ void klp_try_complete_transition(void) } /* Done! Now cleanup the data structures. */ - klp_cond_resched_disable(); + klp_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -560,7 +539,7 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } - klp_cond_resched_enable(); + klp_resched_enable(); klp_signals_cnt = 0; } diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..d2aeaf13c3ac --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + depends on !DEFERRED_STRUCT_PAGE_INIT + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. 
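As a rough sketch of how the handover enabled by this option is used from code, the snippet below shows the producer side in the outgoing kernel, built on the kho_alloc_preserve() and kho_add_subtree() helpers added further down in this series; the "myfeature" subsystem and its single state page are hypothetical:

#include <linux/err.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/mm.h>

/* Outgoing kernel: preserve one page of state and publish it via a sub-FDT. */
static void *state_page;
static void *sub_fdt;

static int myfeature_kho_publish(void)
{
        u64 phys;
        int err;

        /* Zeroed, physically contiguous and already marked for preservation. */
        state_page = kho_alloc_preserve(PAGE_SIZE);
        if (IS_ERR(state_page))
                return PTR_ERR(state_page);

        /* The sub-FDT must be preserved too, so allocate it the same way. */
        sub_fdt = kho_alloc_preserve(PAGE_SIZE);
        if (IS_ERR(sub_fdt))
                return PTR_ERR(sub_fdt);

        phys = virt_to_phys(state_page);
        err = fdt_create(sub_fdt, PAGE_SIZE);
        err |= fdt_finish_reservemap(sub_fdt);
        err |= fdt_begin_node(sub_fdt, "");
        err |= fdt_property(sub_fdt, "state", &phys, sizeof(phys));
        err |= fdt_end_node(sub_fdt);
        err |= fdt_finish(sub_fdt);
        if (err)
                return -EINVAL;

        /* Shows up as a "myfeature" node in the KHO root tree. */
        return kho_add_subtree("myfeature", sub_fdt);
}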
+ +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since KHO performance is crucial in live update + scenarios and the extra code adds overhead, it is + only optionally enabled. + +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow controlling the kexec handover device tree via the debugfs + interface, i.e. finalizing the state or aborting the finalization. + Also enables inspecting the KHO FDT trees through the debugfs binary + blobs. + +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. + + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + +config LIVEUPDATE + bool "Live Update Orchestrator" + depends on KEXEC_HANDOVER + depends on SHMEM + help + Enable the Live Update Orchestrator. Live Update is a mechanism, + typically based on kexec, that allows the kernel to be updated + while keeping selected devices operational across the transition. + These devices are intended to be reclaimed by the new kernel and + re-attached to their original workload without requiring a device + reset. + + The ability to hand over a device from the current to the next kernel + depends on specific support within device drivers and related kernel + subsystems. + + This feature primarily targets virtual machine hosts to quickly update + the kernel hypervisor with minimal disruption to the running virtual + machines. + + If unsure, say N.
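The matching consumer side runs in the successor kernel; a minimal sketch, again assuming the hypothetical "myfeature" sub-FDT from the producer example and relying on is_kho_boot(), kho_retrieve_subtree() and kho_restore_free() from this series:

#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>

/* Incoming kernel: locate and consume the state published before kexec. */
static int myfeature_kho_revive(void)
{
        phys_addr_t fdt_phys;
        const void *fdt;
        const u64 *val;
        void *state;
        int err, len;

        if (!is_kho_boot())
                return 0;       /* cold boot, nothing to revive */

        err = kho_retrieve_subtree("myfeature", &fdt_phys);
        if (err)
                return err;

        fdt = phys_to_virt(fdt_phys);
        val = fdt_getprop(fdt, 0, "state", &len);
        if (!val || len != sizeof(*val))
                return -EINVAL;

        /* The preserved data survived the kexec; read it out... */
        state = phys_to_virt((phys_addr_t)*val);

        /* ...and hand the pages back to the page allocator when done. */
        kho_restore_free(state);

        return 0;
}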
+ +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..7cad2eece32d --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +luo-y := \ + luo_core.o \ + luo_file.o \ + luo_session.o + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o + +obj-$(CONFIG_LIVEUPDATE) += luo.o diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c new file mode 100644 index 000000000000..9dc51fab604f --- /dev/null +++ b/kernel/liveupdate/kexec_handover.c @@ -0,0 +1,1594 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/cleanup.h> +#include <linux/cma.h> +#include <linux/kmemleak.h> +#include <linux/count_zeros.h> +#include <linux/kexec.h> +#include <linux/kexec_handover.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/memblock.h> +#include <linux/page-isolation.h> +#include <linux/unaligned.h> +#include <linux/vmalloc.h> + +#include <asm/early_ioremap.h> + +#include "kexec_handover_internal.h" +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. + */ +#include "../../mm/internal.h" +#include "../kexec_internal.h" +#include "kexec_handover_internal.h" + +#define KHO_FDT_COMPATIBLE "kho-v1" +#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" +#define PROP_SUB_FDT "fdt" + +#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ + +/* + * KHO uses page->private, which is an unsigned long, to store page metadata. + * Use it to store both the magic and the order. + */ +union kho_page_info { + unsigned long page_private; + struct { + unsigned int order; + unsigned int magic; + }; +}; + +static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); + +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +/* + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior + * to kexec would serialize the xarray information into a linked list for the + * successor kernel to parse. 
+ */ + +#define PRESERVE_BITS (PAGE_SIZE * 8) + +struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); +}; + +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + +struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized + * to order. + */ + struct xarray phys_bits; +}; + +struct kho_mem_track { + /* Points to kho_mem_phys, each order gets its own bitmap tree */ + struct xarray orders; +}; + +struct khoser_mem_chunk; + +struct kho_out { + void *fdt; + bool finalized; + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_mem_track track; + struct kho_debugfs dbg; +}; + +static struct kho_out kho_out = { + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), + }, + .finalized = false, +}; + +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) +{ + void *res = xa_load(xa, index); + + if (res) + return res; + + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); + + if (!elm) + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + return ERR_PTR(xa_err(res)); + else if (res) + return res; + + return no_free_ptr(elm); +} + +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (WARN_ON_ONCE(!physxa)) + return; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (WARN_ON_ONCE(!bits)) + return; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + __kho_unpreserve_order(track, pfn, order); + + pfn += 1 << order; + } +} + +static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa, *new_physxa; + const unsigned long pfn_high = pfn >> order; + + might_sleep(); + physxa = xa_load(&track->orders, order); + if (!physxa) { + int err; + + new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL); + if (!new_physxa) + return -ENOMEM; + + xa_init(&new_physxa->phys_bits); + physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, + GFP_KERNEL); + + err = xa_err(physxa); + if (err || physxa) { + xa_destroy(&new_physxa->phys_bits); + kfree(new_physxa); + + if (err) + return err; + } else { + physxa = new_physxa; + } + } + + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (IS_ERR(bits)) + return PTR_ERR(bits); + + set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + return 0; +} + +static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) +{ + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned int nr_pages, ref_cnt; + union kho_page_info info; + + if (!page) + return NULL; + + info.page_private = page->private; + /* + * deserialize_bitmap() only sets the magic on the head page. This magic + * check also implicitly makes sure phys is order-aligned since for + * non-order-aligned phys addresses, magic will never be set. 
+ */ + if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER)) + return NULL; + nr_pages = (1 << info.order); + + /* Clear private to make sure later restores on this page error out. */ + page->private = 0; + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* + * For higher order folios, tail pages get a page count of zero. + * For physically contiguous order-0 pages every page gets a page + * count of 1. + */ + ref_cnt = is_folio ? 0 : 1; + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, ref_cnt); + + if (is_folio && info.order) + prep_compound_page(page, info.order); + + adjust_managed_page_count(page, nr_pages); + return page; +} + +/** + * kho_restore_folio - recreates the folio from the preserved memory. + * @phys: physical address of the folio. + * + * Return: pointer to the struct folio on success, NULL on failure. + */ +struct folio *kho_restore_folio(phys_addr_t phys) +{ + struct page *page = kho_restore_page(phys, true); + + return page ? page_folio(page) : NULL; +} +EXPORT_SYMBOL_GPL(kho_restore_folio); + +/** + * kho_restore_pages - restore a list of contiguous order 0 pages. + * @phys: physical address of the first page. + * @nr_pages: number of pages. + * + * Restore a contiguous list of order 0 pages that was preserved with + * kho_preserve_pages(). + * + * Return: pointer to the first page on success, NULL on failure. + */ +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +{ + const unsigned long start_pfn = PHYS_PFN(phys); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn), false); + + if (!page) + return NULL; + pfn += 1 << order; + } + + return pfn_to_page(start_pfn); +} +EXPORT_SYMBOL_GPL(kho_restore_pages); + +/* Serialize and deserialize struct kho_mem_phys across kexec + * + * Record all the bitmaps in a linked list of pages for the next kernel to + * process. Each chunk holds bitmaps of the same order and each block of bitmaps + * starts at a given physical address. This allows the bitmaps to be sparse. The + * xarray is used to store them in a tree while building up the data structure, + * but the KHO successor kernel only needs to process them once in order. + * + * All of this memory is normal kmalloc() memory and is not marked for + * preservation. The successor kernel will remain isolated to the scratch space + * until it completes processing this list. Once processed, all the memory + * storing these ranges will be marked as free.
+ */ + +struct khoser_mem_bitmap_ptr { + phys_addr_t phys_start; + DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); +}; + +struct khoser_mem_chunk_hdr { + DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); + unsigned int order; + unsigned int num_elms; +}; + +#define KHOSER_BITMAP_SIZE \ + ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ + sizeof(struct khoser_mem_bitmap_ptr)) + +struct khoser_mem_chunk { + struct khoser_mem_chunk_hdr hdr; + struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; +}; + +static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); + +static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) +{ + struct khoser_mem_chunk *chunk __free(free_page) = NULL; + + chunk = (void *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); + return no_free_ptr(chunk); +} + +static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +{ + struct khoser_mem_chunk *chunk = first_chunk; + + while (chunk) { + struct khoser_mem_chunk *tmp = chunk; + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} + +/* + * Update memory map property, if old one is found discard it via + * kho_mem_ser_free(). + */ +static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) +{ + void *ptr; + u64 phys; + + ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + + /* Check and discard previous memory map */ + phys = get_unaligned((u64 *)ptr); + if (phys) + kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); + + /* Update with the new value */ + phys = first_chunk ? 
(u64)virt_to_phys(first_chunk) : 0; + put_unaligned(phys, (u64 *)ptr); +} + +static int kho_mem_serialize(struct kho_out *kho_out) +{ + struct khoser_mem_chunk *first_chunk = NULL; + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; + int err = -ENOMEM; + + xa_for_each(&kho_out->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); + goto err_free; + } + + if (!first_chunk) + first_chunk = chunk; + + xa_for_each(&physxa->phys_bits, phys, bits) { + struct khoser_mem_bitmap_ptr *elm; + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); + goto err_free; + } + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; + chunk->hdr.num_elms++; + elm->phys_start = (phys * PRESERVE_BITS) + << (order + PAGE_SHIFT); + KHOSER_STORE_PTR(elm->bitmap, bits); + } + } + + kho_update_memory_map(first_chunk); + + return 0; + +err_free: + kho_mem_ser_free(first_chunk); + return err; +} + +static void __init deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) +{ + struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); + unsigned long bit; + + for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { + int sz = 1 << (order + PAGE_SHIFT); + phys_addr_t phys = + elm->phys_start + (bit << (order + PAGE_SHIFT)); + struct page *page = phys_to_page(phys); + union kho_page_info info; + + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; + } +} + +/* Return true if memory was deserizlied */ +static bool __init kho_mem_deserialize(const void *fdt) +{ + struct khoser_mem_chunk *chunk; + const void *mem_ptr; + u64 mem; + int len; + + mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + if (!mem_ptr || len != sizeof(u64)) { + pr_err("failed to get preserved memory bitmaps\n"); + return false; + } + + mem = get_unaligned((const u64 *)mem_ptr); + chunk = mem ? phys_to_virt(mem) : NULL; + + /* No preserved physical pages were passed, no deserialization */ + if (!chunk) + return false; + + while (chunk) { + unsigned int i; + + for (i = 0; i != chunk->hdr.num_elms; i++) + deserialize_bitmap(chunk->hdr.order, + &chunk->bitmaps[i]); + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + } + + return true; +} + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; + +/* + * The scratch areas are scaled by default as percent of memory allocated from + * memblock. A user can override the scale with command line parameter: + * + * kho_scratch=N% + * + * It is also possible to explicitly define size for a lowmem, a global and + * per-node scratch areas: + * + * kho_scratch=l[KMG],n[KMG],m[KMG] + * + * The explicit size definition takes precedence over scale definition. 
+ */ +static unsigned int scratch_scale __initdata = 200; +static phys_addr_t scratch_size_global __initdata; +static phys_addr_t scratch_size_pernode __initdata; +static phys_addr_t scratch_size_lowmem __initdata; + +static int __init kho_parse_scratch_size(char *p) +{ + size_t len; + unsigned long sizes[3]; + size_t total_size = 0; + int i; + + if (!p) + return -EINVAL; + + len = strlen(p); + if (!len) + return -EINVAL; + + /* parse nn% */ + if (p[len - 1] == '%') { + /* unsigned int max is 4,294,967,295, 10 chars */ + char s_scale[11] = {}; + int ret = 0; + + if (len > ARRAY_SIZE(s_scale)) + return -EINVAL; + + memcpy(s_scale, p, len - 1); + ret = kstrtouint(s_scale, 10, &scratch_scale); + if (!ret) + pr_notice("scratch scale is %d%%\n", scratch_scale); + return ret; + } + + /* parse ll[KMG],mm[KMG],nn[KMG] */ + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + char *endp = p; + + if (i > 0) { + if (*p != ',') + return -EINVAL; + p += 1; + } + + sizes[i] = memparse(p, &endp); + if (endp == p) + return -EINVAL; + p = endp; + total_size += sizes[i]; + } + + if (!total_size) + return -EINVAL; + + /* The string should be fully consumed by now. */ + if (*p) + return -EINVAL; + + scratch_size_lowmem = sizes[0]; + scratch_size_global = sizes[1]; + scratch_size_pernode = sizes[2]; + scratch_scale = 0; + + pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", + (u64)(scratch_size_lowmem >> 20), + (u64)(scratch_size_global >> 20), + (u64)(scratch_size_pernode >> 20)); + + return 0; +} +early_param("kho_scratch", kho_parse_scratch_size); + +static void __init scratch_size_update(void) +{ + phys_addr_t size; + + if (!scratch_scale) + return; + + size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); + size = size * scratch_scale / 100; + scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); + + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + NUMA_NO_NODE); + size = size * scratch_scale / 100 - scratch_size_lowmem; + scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +static phys_addr_t __init scratch_size_node(int nid) +{ + phys_addr_t size; + + if (scratch_scale) { + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + nid); + size = size * scratch_scale / 100; + } else { + size = scratch_size_pernode; + } + + return round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. 
+ */ +static void __init kho_reserve_scratch(void) +{ + phys_addr_t addr, size; + int nid, i = 0; + + if (!kho_enable) + return; + + scratch_size_update(); + + /* FIXME: deal with node hot-plug/remove */ + kho_scratch_cnt = num_online_nodes() + 2; + size = kho_scratch_cnt * sizeof(*kho_scratch); + kho_scratch = memblock_alloc(size, PAGE_SIZE); + if (!kho_scratch) + goto err_disable_kho; + + /* + * reserve scratch area in low memory for lowmem allocations in the + * next kernel + */ + size = scratch_size_lowmem; + addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, + ARCH_LOW_ADDRESS_LIMIT); + if (!addr) + goto err_free_scratch_desc; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + /* reserve large contiguous area for allocations without nid */ + size = scratch_size_global; + addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + for_each_online_node(nid) { + size = scratch_size_node(nid); + addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, + 0, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, true); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + } + + return; + +err_free_scratch_areas: + for (i--; i >= 0; i--) + memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); +err_free_scratch_desc: + memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); +err_disable_kho: + pr_warn("Failed to reserve scratch area, disabling kexec handover\n"); + kho_enable = false; +} + +/** + * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * @name: name of the sub tree. + * @fdt: the sub tree blob. + * + * Creates a new child node named @name in KHO root FDT and records + * the physical address of @fdt. The pages of @fdt must also be preserved + * by KHO for the new kernel to retrieve it after kexec. 
+ * + * A debugfs blob entry is also created at + * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS + * + * Return: 0 on success, error code on failure + */ +int kho_add_subtree(const char *name, void *fdt) +{ + phys_addr_t phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int err = -ENOMEM; + int off, fdt_err; + + guard(mutex)(&kho_out.lock); + + fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (fdt_err < 0) + return err; + + off = fdt_add_subnode(root_fdt, 0, name); + if (off < 0) { + if (off == -FDT_ERR_EXISTS) + err = -EEXIST; + goto out_pack; + } + + err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + if (err < 0) + goto out_pack; + + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + +out_pack: + fdt_pack(root_fdt); + + return err; +} +EXPORT_SYMBOL_GPL(kho_add_subtree); + +void kho_remove_subtree(void *fdt) +{ + phys_addr_t target_phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int off; + int err; + + guard(mutex)(&kho_out.lock); + + err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (err < 0) + return; + + for (off = fdt_first_subnode(root_fdt, 0); off >= 0; + off = fdt_next_subnode(root_fdt, off)) { + const u64 *val; + int len; + + val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + if (!val || len != sizeof(phys_addr_t)) + continue; + + if ((phys_addr_t)*val == target_phys) { + fdt_del_node(root_fdt, off); + kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } + + fdt_pack(root_fdt); +} +EXPORT_SYMBOL_GPL(kho_remove_subtree); + +/** + * kho_preserve_folio - preserve a folio across kexec. + * @folio: folio to preserve. + * + * Instructs KHO to preserve the whole folio across kexec. The order + * will be preserved as well. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + + return __kho_preserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_preserve_folio); + +/** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + */ +void kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + __kho_unpreserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + +/** + * kho_preserve_pages - preserve contiguous pages across kexec + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Preserve a contiguous list of order 0 pages. Must be restored using + * kho_restore_pages() to ensure the pages are restored properly as order 0. 
+ * + * Return: 0 on success, error code on failure + */ +int kho_preserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; + unsigned long failed_pfn = 0; + int err = 0; + + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + err = __kho_preserve_order(track, pfn, order); + if (err) { + failed_pfn = pfn; + break; + } + + pfn += 1 << order; + } + + if (err) + __kho_unpreserve(track, start_pfn, failed_pfn); + + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_pages); + +/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page. + * This must be called with the same @page and @nr_pages as the corresponding + * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger + * preserved blocks is not supported. + */ +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + __kho_unpreserve(track, start_pfn, end_pfn); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(phys_addr_t)) + +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + phys_addr_t phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* vmalloc flags KHO supports */ +#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) + +/* KHO internal flags for vmalloc preservations */ +#define KHO_VMALLOC_ALLOC 0x0001 +#define KHO_VMALLOC_HUGE_VMAP 0x0002 + +static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags) +{ + unsigned short kho_flags = 0; + + if (vm_flags & VM_ALLOC) + kho_flags |= KHO_VMALLOC_ALLOC; + if (vm_flags & VM_ALLOW_HUGE_VMAP) + kho_flags |= KHO_VMALLOC_HUGE_VMAP; + + return kho_flags; +} + +static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags) +{ + unsigned int vm_flags = 0; + + if (kho_flags & KHO_VMALLOC_ALLOC) + vm_flags |= VM_ALLOC; + if (kho_flags & KHO_VMALLOC_HUGE_VMAP) + vm_flags |= VM_ALLOW_HUGE_VMAP; + + return vm_flags; +} + +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur) +{ + struct kho_vmalloc_chunk *chunk; + int err; + + chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return NULL; + + err = kho_preserve_pages(virt_to_page(chunk), 1); + if (err) + goto err_free; + if (cur) + KHOSER_STORE_PTR(cur->hdr.next, chunk); + return chunk; + +err_free: + free_page((unsigned long)chunk); + return NULL; +} + +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) +{ + struct kho_mem_track *track = &kho_out.track; + unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); + + __kho_unpreserve(track, pfn, pfn + 1); + + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { + pfn = PHYS_PFN(chunk->phys[i]); + __kho_unpreserve(track, pfn, pfn + (1 << order)); + } +} + 
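A minimal sketch of the preserve/restore pairing for raw order-0 page ranges described above, with a hypothetical 16-page buffer (not part of the patch):

#include <linux/kexec_handover.h>
#include <linux/mm.h>

/* Outgoing kernel: preserve a physically contiguous 16-page buffer. */
static int myfeature_preserve_buffer(struct page *first_page)
{
        /* Ranges overlapping the KHO scratch areas are rejected with -EINVAL. */
        return kho_preserve_pages(first_page, 16);
}

/* Incoming kernel: reinitialise the struct pages before using the buffer. */
static void *myfeature_revive_buffer(phys_addr_t phys)
{
        struct page *page = kho_restore_pages(phys, 16);

        if (!page)
                return NULL;    /* never preserved, or already restored */

        return page_address(page);
}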
+/** + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec + * @ptr: pointer to the area in vmalloc address space + * @preservation: placeholder for preservation metadata + * + * Instructs KHO to preserve the area in vmalloc address space at @ptr. The + * physical pages mapped at @ptr will be preserved and on successful return + * @preservation will hold the physical address of a structure that describes + * the preservation. + * + * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably + * restored on the same node + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk; + struct vm_struct *vm = find_vm_area(ptr); + unsigned int order, flags, nr_contig_pages; + unsigned int idx = 0; + int err; + + if (!vm) + return -EINVAL; + + if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return -EOPNOTSUPP; + + flags = vmalloc_flags_to_kho(vm->flags); + order = get_vm_area_page_order(vm); + + chunk = new_vmalloc_chunk(NULL); + if (!chunk) + return -ENOMEM; + KHOSER_STORE_PTR(preservation->first, chunk); + + nr_contig_pages = (1 << order); + for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) { + phys_addr_t phys = page_to_phys(vm->pages[i]); + + err = kho_preserve_pages(vm->pages[i], nr_contig_pages); + if (err) + goto err_free; + + chunk->phys[idx++] = phys; + if (idx == ARRAY_SIZE(chunk->phys)) { + chunk = new_vmalloc_chunk(chunk); + if (!chunk) + goto err_free; + idx = 0; + } + } + + preservation->total_pages = vm->nr_pages; + preservation->flags = flags; + preservation->order = order; + + return 0; + +err_free: + kho_unpreserve_vmalloc(preservation); + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); + +/** + * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc() + * @preservation: preservation metadata returned by kho_preserve_vmalloc() + * + * Instructs KHO to unpreserve the area in vmalloc address space that was + * previously preserved with kho_preserve_vmalloc(). + */ +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk, preservation->order); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} +EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); + +/** + * kho_restore_vmalloc - recreates and populates an area in vmalloc address + * space from the preserved memory. + * @preservation: preservation metadata. + * + * Recreates an area in vmalloc address space and populates it with memory that + * was preserved using kho_preserve_vmalloc(). + * + * Return: pointer to the area in the vmalloc address space, NULL on failure. 
+ */ +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + unsigned int align, order, shift, vm_flags; + unsigned long total_pages, contig_pages; + unsigned long addr, size; + struct vm_struct *area; + struct page **pages; + unsigned int idx = 0; + int err; + + vm_flags = kho_flags_to_vmalloc(preservation->flags); + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return NULL; + + total_pages = preservation->total_pages; + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; + order = preservation->order; + contig_pages = (1 << order); + shift = PAGE_SHIFT + order; + align = 1 << shift; + + while (chunk) { + struct page *page; + + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { + phys_addr_t phys = chunk->phys[i]; + + if (idx + contig_pages > total_pages) + goto err_free_pages_array; + + page = kho_restore_pages(phys, contig_pages); + if (!page) + goto err_free_pages_array; + + for (int j = 0; j < contig_pages; j++) + pages[idx++] = page + j; + + phys += contig_pages * PAGE_SIZE; + } + + page = kho_restore_pages(virt_to_phys(chunk), 1); + if (!page) + goto err_free_pages_array; + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + __free_page(page); + } + + if (idx != total_pages) + goto err_free_pages_array; + + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, + vm_flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + if (!area) + goto err_free_pages_array; + + addr = (unsigned long)area->addr; + size = get_vm_area_size(area); + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); + if (err) + goto err_free_vm_area; + + area->nr_pages = total_pages; + area->pages = pages; + + return area->addr; + +err_free_vm_area: + free_vm_area(area); +err_free_pages_array: + kvfree(pages); + return NULL; +} +EXPORT_SYMBOL_GPL(kho_restore_vmalloc); + +/** + * kho_alloc_preserve - Allocate, zero, and preserve memory. + * @size: The number of bytes to allocate. + * + * Allocates a physically contiguous block of zeroed pages that is large + * enough to hold @size bytes. The allocated memory is then registered with + * KHO for preservation across a kexec. + * + * Note: The actual allocated size will be rounded up to the nearest + * power-of-two page boundary. + * + * @return A virtual pointer to the allocated and preserved memory on success, + * or an ERR_PTR() encoded error on failure. + */ +void *kho_alloc_preserve(size_t size) +{ + struct folio *folio; + int order, ret; + + if (!size) + return ERR_PTR(-EINVAL); + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-E2BIG); + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); + if (!folio) + return ERR_PTR(-ENOMEM); + + ret = kho_preserve_folio(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } + + return folio_address(folio); +} +EXPORT_SYMBOL_GPL(kho_alloc_preserve); + +/** + * kho_unpreserve_free - Unpreserve and free memory. + * @mem: Pointer to the memory allocated by kho_alloc_preserve(). + * + * Unregisters the memory from KHO preservation and frees the underlying + * pages back to the system. This function should be called to clean up + * memory allocated with kho_alloc_preserve(). 
+ */ +void kho_unpreserve_free(void *mem) +{ + struct folio *folio; + + if (!mem) + return; + + folio = virt_to_folio(mem); + kho_unpreserve_folio(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_free); + +/** + * kho_restore_free - Restore and free memory after kexec. + * @mem: Pointer to the memory (in the new kernel's address space) + * that was allocated by the old kernel. + * + * This function is intended to be called in the new kernel (post-kexec) + * to take ownership of and free a memory region that was preserved by the + * old kernel using kho_alloc_preserve(). + * + * It first restores the pages from KHO (using their physical address) + * and then frees the pages back to the new kernel's page allocator. + */ +void kho_restore_free(void *mem) +{ + struct folio *folio; + + if (!mem) + return; + + folio = kho_restore_folio(__pa(mem)); + if (!WARN_ON(!folio)) + folio_put(folio); +} +EXPORT_SYMBOL_GPL(kho_restore_free); + +int kho_finalize(void) +{ + int ret; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + ret = kho_mem_serialize(&kho_out); + if (ret) + return ret; + + kho_out.finalized = true; + + return 0; +} + +bool kho_finalized(void) +{ + guard(mutex)(&kho_out.lock); + return kho_out.finalized; +} + +struct kho_in { + phys_addr_t fdt_phys; + phys_addr_t scratch_phys; + struct kho_debugfs dbg; +}; + +static struct kho_in kho_in = { +}; + +static const void *kho_get_fdt(void) +{ + return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; +} + +/** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + +/** + * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. + * @name: the name of the sub FDT passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub FDT is stored in @phys. + * + * Retrieve a preserved sub FDT named @name and store its physical + * address in @phys. 
+ * + * Return: 0 on success, error code on failure + */ +int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + const void *fdt = kho_get_fdt(); + const u64 *val; + int offset, len; + + if (!fdt) + return -ENOENT; + + if (!phys) + return -EINVAL; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) + return -ENOENT; + + val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + if (!val || len != sizeof(*val)) + return -EINVAL; + + *phys = (phys_addr_t)*val; + + return 0; +} +EXPORT_SYMBOL_GPL(kho_retrieve_subtree); + +static __init int kho_out_fdt_setup(void) +{ + void *root = kho_out.fdt; + u64 empty_mem_map = 0; + int err; + + err = fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); + err |= fdt_end_node(root); + err |= fdt_finish(root); + + return err; +} + +static __init int kho_init(void) +{ + const void *fdt = kho_get_fdt(); + int err = 0; + + if (!kho_enable) + return 0; + + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(kho_out.fdt)) { + err = PTR_ERR(kho_out.fdt); + goto err_free_scratch; + } + + err = kho_debugfs_init(); + if (err) + goto err_free_fdt; + + err = kho_out_debugfs_init(&kho_out.dbg); + if (err) + goto err_free_fdt; + + err = kho_out_fdt_setup(); + if (err) + goto err_free_fdt; + + if (fdt) { + kho_in_debugfs_init(&kho_in.dbg, fdt); + return 0; + } + + for (int i = 0; i < kho_scratch_cnt; i++) { + unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); + unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; + unsigned long pfn; + + /* + * When debug_pagealloc is enabled, __free_pages() clears the + * corresponding PRESENT bit in the kernel page table. + * Subsequent kmemleak scans of these pages cause the + * non-PRESENT page faults. + * Mark scratch areas with kmemleak_ignore_phys() to exclude + * them from kmemleak scanning. + */ + kmemleak_ignore_phys(kho_scratch[i].addr); + for (pfn = base_pfn; pfn < base_pfn + count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + + return 0; + +err_free_fdt: + kho_unpreserve_free(kho_out.fdt); +err_free_scratch: + kho_out.fdt = NULL; + for (int i = 0; i < kho_scratch_cnt; i++) { + void *start = __va(kho_scratch[i].addr); + void *end = start + kho_scratch[i].size; + + free_reserved_area(start, end, -1, ""); + } + kho_enable = false; + return err; +} +fs_initcall(kho_init); + +static void __init kho_release_scratch(void) +{ + phys_addr_t start, end; + u64 i; + + memmap_init_kho_scratch_pages(); + + /* + * Mark scratch mem as CMA before we return it. That way we + * ensure that no kernel allocations happen on it. That means + * we can reuse it as scratch memory again later. 
+ */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { + ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); + ulong end_pfn = pageblock_align(PFN_UP(end)); + ulong pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) + init_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA, false); + } +} + +void __init kho_memory_init(void) +{ + if (kho_in.scratch_phys) { + kho_scratch = phys_to_virt(kho_in.scratch_phys); + kho_release_scratch(); + + if (!kho_mem_deserialize(kho_get_fdt())) + kho_in.fdt_phys = 0; + } else { + kho_reserve_scratch(); + } +} + +void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ + void *fdt = NULL; + struct kho_scratch *scratch = NULL; + int err = 0; + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); + err = -EFAULT; + goto out; + } + err = fdt_check_header(fdt); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + err = -EINVAL; + goto out; + } + err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", + fdt_phys, KHO_FDT_COMPATIBLE, err); + err = -EINVAL; + goto out; + } + + scratch = early_memremap(scratch_phys, scratch_len); + if (!scratch) { + pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", + scratch_phys, scratch_len); + err = -EFAULT; + goto out; + } + + /* + * We pass a safe contiguous blocks of memory to use for early boot + * purporses from the previous kernel so that we can resize the + * memblock array as needed. + */ + for (int i = 0; i < scratch_cnt; i++) { + struct kho_scratch *area = &scratch[i]; + u64 size = area->size; + + memblock_add(area->addr, size); + err = memblock_mark_kho_scratch(area->addr, size); + if (WARN_ON(err)) { + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", + &area->addr, &size, ERR_PTR(err)); + goto out; + } + pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); + } + + memblock_reserve(scratch_phys, scratch_len); + + /* + * Now that we have a viable region of scratch memory, let's tell + * the memblocks allocator to only use that for any allocations. + * That way we ensure that nothing scribbles over in use data while + * we initialize the page tables which we will need to ingest all + * memory reservations from the previous kernel. 
+ */ + memblock_set_kho_scratch_only(); + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = scratch_phys; + kho_scratch_cnt = scratch_cnt; + pr_info("found kexec handover data.\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (scratch) + early_memunmap(scratch, scratch_len); + if (err) + pr_warn("disabling KHO revival: %d\n", err); +} + +/* Helper functions for kexec_file_load */ + +int kho_fill_kimage(struct kimage *image) +{ + ssize_t scratch_size; + int err = 0; + struct kexec_buf scratch; + + if (!kho_enable) + return 0; + + image->kho.fdt = virt_to_phys(kho_out.fdt); + + scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; + scratch = (struct kexec_buf){ + .image = image, + .buffer = kho_scratch, + .bufsz = scratch_size, + .mem = KEXEC_BUF_MEM_UNKNOWN, + .memsz = scratch_size, + .buf_align = SZ_64K, /* Makes it easier to map */ + .buf_max = ULONG_MAX, + .top_down = true, + }; + err = kexec_add_buffer(&scratch); + if (err) + return err; + image->kho.scratch = &image->segment[image->nr_segments - 1]; + + return 0; +} + +static int kho_walk_scratch(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + struct resource res = { + .start = kho_scratch[i].addr, + .end = kho_scratch[i].addr + kho_scratch[i].size - 1, + }; + + /* Try to fit the kimage into our KHO scratch region */ + ret = func(&res, kbuf); + if (ret) + break; + } + + return ret; +} + +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret; + + if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) + return 1; + + ret = kho_walk_scratch(kbuf, func); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} diff --git a/kernel/liveupdate/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/liveupdate/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c new file mode 100644 index 000000000000..2abbf62ba942 --- /dev/null +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct 
dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +{ + struct fdt_debugfs *ff; + + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 val) +{ + if (val) + return kho_finalize(); + else + return -EINVAL; +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %pe\n", name, + ERR_PTR(err)); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. 
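+ *
+ * When creation succeeds, the resulting layout (based on the entries
+ * created in this file) is roughly:
+ *
+ *   /sys/kernel/debug/kho/in/fdt              blob of the root handover FDT
+ *   /sys/kernel/debug/kho/in/sub_fdts/<name>  one blob per preserved sub-FDT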
+ */ + if (err) { + pr_err("failed exposing handover FDT in debugfs: %pe\n", + ERR_PTR(err)); + } +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h new file mode 100644 index 000000000000..0202c85ad14f --- /dev/null +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/list.h> +#include <linux/types.h> + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include <linux/debugfs.h> + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +bool kho_finalized(void); +int kho_finalize(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c new file mode 100644 index 000000000000..944663d99dd9 --- /dev/null +++ b/kernel/liveupdate/luo_core.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. 
+ * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: Live Update Orchestrator (LUO) + * + * Live Update is a specialized, kexec-based reboot process that allows a + * running kernel to be updated from one version to another while preserving + * the state of selected resources and keeping designated hardware devices + * operational. For these devices, DMA activity may continue throughout the + * kernel transition. + * + * While the primary use case driving this work is supporting live updates of + * the Linux kernel when it is used as a hypervisor in cloud environments, the + * LUO framework itself is designed to be workload-agnostic. Live Update + * facilitates a full kernel version upgrade for any type of system. + * + * For example, a non-hypervisor system running an in-memory cache like + * memcached with many gigabytes of data can use LUO. The userspace service + * can place its cache into a memfd, have its state preserved by LUO, and + * restore it immediately after the kernel kexec. + * + * Whether the system is running virtual machines, containers, a + * high-performance database, or networking services, LUO's primary goal is to + * enable a full kernel update by preserving critical userspace state and + * keeping essential devices operational. + * + * The core of LUO is a mechanism that tracks the progress of a live update, + * along with a callback API that allows other kernel subsystems to participate + * in the process. Example subsystems that can hook into LUO include: kvm, + * iommu, interrupts, vfio, participating filesystems, and memory management. + * + * LUO uses Kexec Handover to transfer memory state from the current kernel to + * the next kernel. For more details see + * Documentation/core-api/kho/concepts.rst. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/kobject.h> +#include <linux/libfdt.h> +#include <linux/liveupdate.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/sizes.h> +#include <linux/string.h> +#include <linux/unaligned.h> + +#include "kexec_handover_internal.h" +#include "luo_internal.h" + +static struct { + bool enabled; + void *fdt_out; + void *fdt_in; + u64 liveupdate_num; +} luo_global; + +static int __init early_liveupdate_param(char *buf) +{ + return kstrtobool(buf, &luo_global.enabled); +} +early_param("liveupdate", early_liveupdate_param); + +static int __init luo_early_startup(void) +{ + phys_addr_t fdt_phys; + int err, ln_size; + const void *ptr; + + if (!kho_is_enabled()) { + if (liveupdate_enabled()) + pr_warn("Disabling liveupdate because KHO is disabled\n"); + luo_global.enabled = false; + return 0; + } + + /* Retrieve LUO subtree, and verify its format. 
*/ + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + if (err) { + if (err != -ENOENT) { + pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", + LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err)); + return err; + } + + return 0; + } + + luo_global.fdt_in = phys_to_virt(fdt_phys); + err = fdt_node_check_compatible(luo_global.fdt_in, 0, + LUO_FDT_COMPATIBLE); + if (err) { + pr_err("FDT '%s' is incompatible with '%s' [%d]\n", + LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err); + + return -EINVAL; + } + + ln_size = 0; + ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM, + &ln_size); + if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) { + pr_err("Unable to get live update number '%s' [%d]\n", + LUO_FDT_LIVEUPDATE_NUM, ln_size); + + return -EINVAL; + } + + luo_global.liveupdate_num = get_unaligned((u64 *)ptr); + pr_info("Retrieved live update data, liveupdate number: %lld\n", + luo_global.liveupdate_num); + + err = luo_session_setup_incoming(luo_global.fdt_in); + if (err) + return err; + + return 0; +} + +static int __init liveupdate_early_init(void) +{ + int err; + + err = luo_early_startup(); + if (err) { + luo_global.enabled = false; + luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n", + ERR_PTR(err)); + } + + return err; +} +early_initcall(liveupdate_early_init); + +/* Called during boot to create outgoing LUO fdt tree */ +static int __init luo_fdt_setup(void) +{ + const u64 ln = luo_global.liveupdate_num + 1; + void *fdt_out; + int err; + + fdt_out = kho_alloc_preserve(LUO_FDT_SIZE); + if (IS_ERR(fdt_out)) { + pr_err("failed to allocate/preserve FDT memory\n"); + return PTR_ERR(fdt_out); + } + + err = fdt_create(fdt_out, LUO_FDT_SIZE); + err |= fdt_finish_reservemap(fdt_out); + err |= fdt_begin_node(fdt_out, ""); + err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); + err |= luo_session_setup_outgoing(fdt_out); + err |= fdt_end_node(fdt_out); + err |= fdt_finish(fdt_out); + if (err) + goto exit_free; + + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + if (err) + goto exit_free; + luo_global.fdt_out = fdt_out; + + return 0; + +exit_free: + kho_unpreserve_free(fdt_out); + pr_err("failed to prepare LUO FDT: %d\n", err); + + return err; +} + +/* + * late initcall because it initializes the outgoing tree that is needed only + * once userspace starts using /dev/liveupdate. + */ +static int __init luo_late_startup(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_fdt_setup(); + if (err) + luo_global.enabled = false; + + return err; +} +late_initcall(luo_late_startup); + +/* Public Functions */ + +/** + * liveupdate_reboot() - Kernel reboot notifier for live update final + * serialization. + * + * This function is invoked directly from the reboot() syscall pathway + * if kexec is in progress. + * + * If any callback fails, this function aborts KHO, undoes the freeze() + * callbacks, and returns an error. + */ +int liveupdate_reboot(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_session_serialize(); + if (err) + return err; + + err = kho_finalize(); + if (err) { + pr_err("kho_finalize failed %d\n", err); + /* + * kho_finalize() may return libfdt errors, to aboid passing to + * userspace unknown errors, change this to EAGAIN. + */ + err = -EAGAIN; + } + + return err; +} + +/** + * liveupdate_enabled - Check if the live update feature is enabled. 
+ * + * This function returns the state of the live update feature flag, which + * can be controlled via the ``liveupdate`` kernel command-line parameter. + * + * @return true if live update is enabled, false otherwise. + */ +bool liveupdate_enabled(void) +{ + return luo_global.enabled; +} + +/** + * DOC: LUO ioctl Interface + * + * The IOCTL user-space control interface for the LUO subsystem. + * It registers a character device, typically found at ``/dev/liveupdate``, + * which allows a userspace agent to manage the LUO state machine and its + * associated resources, such as preservable file descriptors. + * + * To ensure that the state machine is controlled by a single entity, access + * to this device is exclusive: only one process is permitted to have + * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will + * fail with -EBUSY until the first process closes its file descriptor. + * This singleton model simplifies state management by preventing conflicting + * commands from multiple userspace agents. + */ + +struct luo_device_state { + struct miscdevice miscdev; + atomic_t in_use; +}; + +static int luo_ioctl_create_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_create_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_create(argp->name, &file); + if (err) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_retrieve(argp->name, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_open(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + + if (atomic_cmpxchg(&ldev->in_use, 0, 1)) + return -EBUSY; + + /* Always return -EIO to user if deserialization fail */ + if (luo_session_deserialize()) { + atomic_set(&ldev->in_use, 0); + return -EIO; + } + + return 0; +} + +static int luo_release(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + atomic_set(&ldev->in_use, 0); + + return 0; +} + +union ucmd_buffer { + struct liveupdate_ioctl_create_session create; + struct liveupdate_ioctl_retrieve_session retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session, + struct 
liveupdate_ioctl_create_session, name), + IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session, + struct liveupdate_ioctl_retrieve_session, name), +}; + +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int err; + + nr = _IOC_NR(cmd); + if (nr - LIVEUPDATE_CMD_BASE >= ARRAY_SIZE(luo_ioctl_ops)) + return -EINVAL; + + ucmd.ubuffer = (void __user *)arg; + err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (err) + return err; + + op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (err) + return err; + + return op->execute(&ucmd); +} + +static const struct file_operations luo_fops = { + .owner = THIS_MODULE, + .open = luo_open, + .release = luo_release, + .unlocked_ioctl = luo_ioctl, +}; + +static struct luo_device_state luo_dev = { + .miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "liveupdate", + .fops = &luo_fops, + }, + .in_use = ATOMIC_INIT(0), +}; + +static int __init liveupdate_ioctl_init(void) +{ + if (!liveupdate_enabled()) + return 0; + + return misc_register(&luo_dev.miscdev); +} +late_initcall(liveupdate_ioctl_init); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c new file mode 100644 index 000000000000..a32a777f6df8 --- /dev/null +++ b/kernel/liveupdate/luo_file.c @@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO File Descriptors + * + * LUO provides the infrastructure to preserve specific, stateful file + * descriptors across a kexec-based live update. The primary goal is to allow + * workloads, such as virtual machines using vfio, memfd, or iommufd, to + * retain access to their essential resources without interruption. + * + * The framework is built around a callback-based handler model and a well- + * defined lifecycle for each preserved file. + * + * Handler Registration: + * Kernel modules responsible for a specific file type (e.g., memfd, vfio) + * register a &struct liveupdate_file_handler. This handler provides a set of + * callbacks that LUO invokes at different stages of the update process, most + * notably: + * + * - can_preserve(): A lightweight check to determine if the handler is + * compatible with a given 'struct file'. + * - preserve(): The heavyweight operation that saves the file's state and + * returns an opaque u64 handle. This is typically performed while the + * workload is still active to minimize the downtime during the + * actual reboot transition. + * - unpreserve(): Cleans up any resources allocated by .preserve(), called + * if the preservation process is aborted before the reboot (i.e. session is + * closed). + * - freeze(): A final pre-reboot opportunity to prepare the state for kexec. + * We are already in reboot syscall, and therefore userspace cannot mutate + * the file anymore. + * - unfreeze(): Undoes the actions of .freeze(), called if the live update + * is aborted after the freeze phase. + * - retrieve(): Reconstructs the file in the new kernel from the preserved + * handle. + * - finish(): Performs final check and cleanup in the new kernel. After + * succesul finish call, LUO gives up ownership to this file. 
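+ *
+ * As an illustration only (the ops table type name and the "myfd" callbacks
+ * below are assumptions, not a definitive API reference), a minimal handler
+ * registration could look like:
+ *
+ *   static bool myfd_can_preserve(struct liveupdate_file_handler *fh,
+ *                                 struct file *file)
+ *   {
+ *           return file->f_op == &myfd_fops;   // hypothetical type check
+ *   }
+ *
+ *   static const struct liveupdate_file_ops myfd_ops = {
+ *           .owner        = THIS_MODULE,
+ *           .can_preserve = myfd_can_preserve,
+ *           .preserve     = myfd_preserve,     // sets args->serialized_data
+ *           .unpreserve   = myfd_unpreserve,   // undoes .preserve() on abort
+ *           .retrieve     = myfd_retrieve,     // recreates the file, sets args->file
+ *           .finish       = myfd_finish,       // final cleanup in the new kernel
+ *   };
+ *
+ *   static struct liveupdate_file_handler myfd_handler = {
+ *           .compatible = "myfd-v1",
+ *           .ops        = &myfd_ops,
+ *   };
+ *
+ *   err = liveupdate_register_file_handler(&myfd_handler);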
+ * + * File Preservation Lifecycle happy path: + * + * 1. Preserve (Normal Operation): A userspace agent preserves files one by one + * via an ioctl. For each file, luo_preserve_file() finds a compatible + * handler, calls its .preserve() operation, and creates an internal &struct + * luo_file to track the live state. + * + * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called. + * It iterates through all preserved files, calls their respective .freeze() + * operation, and serializes their final metadata (compatible string, token, + * and data handle) into a contiguous memory block for KHO. + * + * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets + * deserialized (which is when /dev/liveupdate is first opened). It reads the + * serialized data from the KHO memory region and reconstructs the in-memory + * list of &struct luo_file instances for the new kernel, linking them to + * their corresponding handlers. + * + * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now + * restore file descriptors by providing a token. luo_retrieve_file() + * searches for the matching token, calls the handler's .retrieve() op to + * re-create the 'struct file', and returns a new FD. Files can be + * retrieved in ANY order. + * + * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete, + * luo_file_finish() is called. It iterates through all files, invokes their + * .finish() operations for final cleanup, and releases all associated kernel + * resources. + * + * File Preservation Lifecycle unhappy paths: + * + * 1. Abort Before Reboot: If the userspace agent aborts the live update + * process before calling reboot (e.g., by closing the session file + * descriptor), the session's release handler calls + * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on + * all preserved files, ensuring all allocated resources are cleaned up and + * returning the system to a clean state. + * + * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze() + * op fails, the .unfreeze() op is invoked on all previously *successful* + * freezes to roll back their state. The reboot() syscall then returns an + * error to userspace, canceling the live update. + * + * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails, + * the luo_file_finish() operation is aborted. LUO retains ownership of + * all files within that session, including those that were not yet + * processed. The userspace agent can attempt to call the finish operation + * again later. If the issue cannot be resolved, these resources will be held + * by LUO until the next live update cycle, at which point they will be + * discarded. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cleanup.h> +#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/liveupdate.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/string.h> +#include "luo_internal.h" + +static LIST_HEAD(luo_file_handler_list); + +/* 2 4K pages, give space for 128 files per file_set */ +#define LUO_FILE_PGCNT 2ul +#define LUO_FILE_MAX \ + ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser)) + +/** + * struct luo_file - Represents a single preserved file instance. 
+ * @fh: Pointer to the &struct liveupdate_file_handler that manages + * this type of file. + * @file: Pointer to the kernel's &struct file that is being preserved. + * This is NULL in the new kernel until the file is successfully + * retrieved. + * @serialized_data: The opaque u64 handle to the serialized state of the file. + * This handle is passed back to the handler's .freeze(), + * .retrieve(), and .finish() callbacks, allowing it to track + * and update its serialized state across phases. + * @private_data: Pointer to the private data for the file used to hold runtime + * state that is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's .unpreserve() + * callback. + * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * successfully called retrieve() on this file. This prevents + * multiple retrieval attempts. + * @mutex: A mutex that protects the fields of this specific instance + * (e.g., @retrieved, @file), ensuring that operations like + * retrieving or finishing a file are atomic. + * @list: The list_head linking this instance into its parent + * file_set's list of preserved files. + * @token: The user-provided unique token used to identify this file. + * + * This structure is the core in-kernel representation of a single file being + * managed through a live update. An instance is created by luo_preserve_file() + * to link a 'struct file' to its corresponding handler, a user-provided token, + * and the serialized state handle returned by the handler's .preserve() + * operation. + * + * These instances are tracked in a per-file_set list. The @serialized_data + * field, which holds a handle to the file's serialized state, may be updated + * during the .freeze() callback before being serialized for the next kernel. + * After reboot, these structures are recreated by luo_file_deserialize() and + * are finally cleaned up by luo_file_finish(). + */ +struct luo_file { + struct liveupdate_file_handler *fh; + struct file *file; + u64 serialized_data; + void *private_data; + bool retrieved; + struct mutex mutex; + struct list_head list; + u64 token; +}; + +static int luo_alloc_files_mem(struct luo_file_set *file_set) +{ + size_t size; + void *mem; + + if (file_set->files) + return 0; + + WARN_ON_ONCE(file_set->count); + + size = LUO_FILE_PGCNT << PAGE_SHIFT; + mem = kho_alloc_preserve(size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + file_set->files = mem; + + return 0; +} + +static void luo_free_files_mem(struct luo_file_set *file_set) +{ + /* If file_set has files, no need to free preservation memory */ + if (file_set->count) + return; + + if (!file_set->files) + return; + + kho_unpreserve_free(file_set->files); + file_set->files = NULL; +} + +static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) +{ + struct luo_file *iter; + + list_for_each_entry(iter, &file_set->files_list, list) { + if (iter->token == token) + return true; + } + + return false; +} + +/** + * luo_preserve_file - Initiate the preservation of a file descriptor. + * @file_set: The file_set to which the preserved file will be added. + * @token: A unique, user-provided identifier for the file. + * @fd: The file descriptor to be preserved. + * + * This function orchestrates the first phase of preserving a file. Upon entry, + * it takes a reference to the 'struct file' via fget(), effectively making LUO + * a co-owner of the file. 
This reference is held until the file is either + * unpreserved or successfully finished in the next kernel, preventing the file + * from being prematurely destroyed. + * + * This function orchestrates the first phase of preserving a file. It performs + * the following steps: + * + * 1. Validates that the @token is not already in use within the file_set. + * 2. Ensures the file_set's memory for files serialization is allocated + * (allocates if needed). + * 3. Iterates through registered handlers, calling can_preserve() to find one + * compatible with the given @fd. + * 4. Calls the handler's .preserve() operation, which saves the file's state + * and returns an opaque private data handle. + * 5. Adds the new instance to the file_set's internal list. + * + * On success, LUO takes a reference to the 'struct file' and considers it + * under its management until it is unpreserved or finished. + * + * In case of any failure, all intermediate allocations (file reference, memory + * for the 'luo_file' struct, etc.) are cleaned up before returning an error. + * + * Context: Can be called from an ioctl handler during normal system operation. + * Return: 0 on success. Returns a negative errno on failure: + * -EEXIST if the token is already used. + * -EBADF if the file descriptor is invalid. + * -ENOSPC if the file_set is full. + * -ENOENT if no compatible handler is found. + * -ENOMEM on memory allocation failure. + * Other erros might be returned by .preserve(). + */ +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) +{ + struct liveupdate_file_op_args args = {0}; + struct liveupdate_file_handler *fh; + struct luo_file *luo_file; + struct file *file; + int err; + + if (luo_token_is_used(file_set, token)) + return -EEXIST; + + if (file_set->count == LUO_FILE_MAX) + return -ENOSPC; + + file = fget(fd); + if (!file) + return -EBADF; + + err = luo_alloc_files_mem(file_set); + if (err) + goto err_fput; + + err = -ENOENT; + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (fh->ops->can_preserve(fh, file)) { + err = 0; + break; + } + } + + /* err is still -ENOENT if no handler was found */ + if (err) + goto err_free_files_mem; + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) { + err = -ENOMEM; + goto err_free_files_mem; + } + + luo_file->file = file; + luo_file->fh = fh; + luo_file->token = token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + + args.handler = fh; + args.file = file; + err = fh->ops->preserve(&args); + if (err) + goto err_kfree; + + luo_file->serialized_data = args.serialized_data; + luo_file->private_data = args.private_data; + list_add_tail(&luo_file->list, &file_set->files_list); + file_set->count++; + + return 0; + +err_kfree: + kfree(luo_file); +err_free_files_mem: + luo_free_files_mem(file_set); +err_fput: + fput(file); + + return err; +} + +/** + * luo_file_unpreserve_files - Unpreserves all files from a file_set. + * @file_set: The files to be cleaned up. + * + * This function serves as the primary cleanup path for a file_set. It is + * invoked when the userspace agent closes the file_set's file descriptor. + * + * For each file, it performs the following cleanup actions: + * 1. Calls the handler's .unpreserve() callback to allow the handler to + * release any resources it allocated. + * 2. Removes the file from the file_set's internal tracking list. + * 3. Releases the reference to the 'struct file' that was taken by + * luo_preserve_file() via fput(), returning ownership. + * 4. 
Frees the memory associated with the internal 'struct luo_file'. + * + * After all individual files are unpreserved, it frees the contiguous memory + * block that was allocated to hold their serialization data. + */ +void luo_file_unpreserve_files(struct luo_file_set *file_set) +{ + struct luo_file *luo_file; + + while (!list_empty(&file_set->files_list)) { + struct liveupdate_file_op_args args = {0}; + + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + luo_file->fh->ops->unpreserve(&args); + + list_del(&luo_file->list); + file_set->count--; + + fput(luo_file->file); + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + luo_free_files_mem(file_set); +} + +static int luo_file_freeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + int err = 0; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->freeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + err = luo_file->fh->ops->freeze(&args); + if (!err) + luo_file->serialized_data = args.serialized_data; + } + + return err; +} + +static void luo_file_unfreeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->unfreeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + luo_file->fh->ops->unfreeze(&args); + } + + luo_file->serialized_data = 0; +} + +static void __luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file *failed_entry) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + + list_for_each_entry(luo_file, files_list, list) { + if (luo_file == failed_entry) + break; + + luo_file_unfreeze_one(file_set, luo_file); + } + + memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT); +} + +/** + * luo_file_freeze - Freezes all preserved files and serializes their metadata. + * @file_set: The file_set whose files are to be frozen. + * @file_set_ser: Where to put the serialized file_set. + * + * This function is called from the reboot() syscall path, just before the + * kernel transitions to the new image via kexec. Its purpose is to perform the + * final preparation and serialization of all preserved files in the file_set. + * + * It iterates through each preserved file in FIFO order (the order of + * preservation) and performs two main actions: + * + * 1. Freezes the File: It calls the handler's .freeze() callback for each + * file. This gives the handler a final opportunity to quiesce the device or + * prepare its state for the upcoming reboot. The handler may update its + * private data handle during this step. + * + * 2. Serializes Metadata: After a successful freeze, it copies the final file + * metadata—the handler's compatible string, the user token, and the final + * private data handle—into the pre-allocated contiguous memory buffer + * (file_set->files) that will be handed over to the next kernel via KHO. + * + * Error Handling (Rollback): + * This function is atomic. If any handler's .freeze() operation fails, the + * entire live update is aborted. 
The __luo_file_unfreeze() helper is + * immediately called to invoke the .unfreeze() op on all files that were + * successfully frozen before the point of failure, rolling them back to a + * running state. The function then returns an error, causing the reboot() + * syscall to fail. + * + * Context: Called only from the liveupdate_reboot() path. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser = file_set->files; + struct luo_file *luo_file; + int err; + int i; + + if (!file_set->count) + return 0; + + if (WARN_ON(!file_ser)) + return -EINVAL; + + i = 0; + list_for_each_entry(luo_file, &file_set->files_list, list) { + err = luo_file_freeze_one(file_set, luo_file); + if (err < 0) { + pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n", + luo_file->token, luo_file->fh->compatible, + ERR_PTR(err)); + goto err_unfreeze; + } + + strscpy(file_ser[i].compatible, luo_file->fh->compatible, + sizeof(file_ser[i].compatible)); + file_ser[i].data = luo_file->serialized_data; + file_ser[i].token = luo_file->token; + i++; + } + + file_set_ser->count = file_set->count; + if (file_set->files) + file_set_ser->files = virt_to_phys(file_set->files); + + return 0; + +err_unfreeze: + __luo_file_unfreeze(file_set, luo_file); + + return err; +} + +/** + * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization + * @file_set: The file_set whose files are to be unfrozen. + * @file_set_ser: Serialized file_set. + * + * This function rolls back the state of all files in a file_set after the + * freeze phase has begun but must be aborted. It is the counterpart to + * luo_file_freeze(). + * + * It invokes the __luo_file_unfreeze() helper with a NULL argument, which + * signals the helper to iterate through all files in the file_set and call + * their respective .unfreeze() handler callbacks. + * + * Context: This is called when the live update is aborted during + * the reboot() syscall, after luo_file_freeze() has been called. + */ +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + if (!file_set->count) + return; + + __luo_file_unfreeze(file_set, NULL); + memset(file_set_ser, 0, sizeof(*file_set_ser)); +} + +/** + * luo_retrieve_file - Restores a preserved file from a file_set by its token. + * @file_set: The file_set from which to retrieve the file. + * @token: The unique token identifying the file to be restored. + * @filep: Output parameter; on success, this is populated with a pointer + * to the newly retrieved 'struct file'. + * + * This function is the primary mechanism for recreating a file in the new + * kernel after a live update. It searches the file_set's list of deserialized + * files for an entry matching the provided @token. + * + * The operation is idempotent: if a file has already been successfully + * retrieved, this function will simply return a pointer to the existing + * 'struct file' and report success without re-executing the retrieve + * operation. This is handled by checking the 'retrieved' flag under a lock. + * + * File retrieval can happen in any order; it is not bound by the order of + * preservation. + * + * Context: Can be called from an ioctl or other in-kernel code in the new + * kernel. + * Return: 0 on success. Returns a negative errno on failure: + * -ENOENT if no file with the matching token is found. + * Any error code returned by the handler's .retrieve() op. 
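+ *
+ * For illustration, a handler's .retrieve() callback (the handler and helper
+ * names below are assumed) rebuilds the file from its serialized handle and
+ * returns it through @args->file:
+ *
+ *   static int myfd_retrieve(struct liveupdate_file_op_args *args)
+ *   {
+ *           struct file *file = myfd_recreate(args->serialized_data);
+ *
+ *           if (IS_ERR(file))
+ *                   return PTR_ERR(file);
+ *
+ *           args->file = file;
+ *           return 0;
+ *   }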
+ */ +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep) +{ + struct liveupdate_file_op_args args = {0}; + struct luo_file *luo_file; + bool found = false; + int err; + + if (list_empty(&file_set->files_list)) + return -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->token == token) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + guard(mutex)(&luo_file->mutex); + if (luo_file->retrieved) { + /* + * Someone is asking for this file again, so get a reference + * for them. + */ + get_file(luo_file->file); + *filep = luo_file->file; + return 0; + } + + args.handler = luo_file->fh; + args.serialized_data = luo_file->serialized_data; + err = luo_file->fh->ops->retrieve(&args); + if (!err) { + luo_file->file = args.file; + + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieved = true; + } + + return err; +} + +static int luo_file_can_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + bool can_finish = true; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->can_finish) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + can_finish = luo_file->fh->ops->can_finish(&args); + } + + return can_finish ? 0 : -EBUSY; +} + +static void luo_file_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + struct liveupdate_file_op_args args = {0}; + + guard(mutex)(&luo_file->mutex); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + + luo_file->fh->ops->finish(&args); +} + +/** + * luo_file_finish - Completes the lifecycle for all files in a file_set. + * @file_set: The file_set to be finalized. + * + * This function orchestrates the final teardown of a live update file_set in + * the new kernel. It should be called after all necessary files have been + * retrieved and the userspace agent is ready to release the preserved state. + * + * The function iterates through all tracked files. For each file, it performs + * the following sequence of cleanup actions: + * + * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on + * every file in the file_set. If all can_finish return true, continue to + * finish. + * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to + * allow for final resource cleanup within the handler. + * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This + * is the counterpart to the get_file() call in luo_retrieve_file(). + * 4. Removes the 'struct luo_file' from the file_set's internal list. + * 5. Frees the memory for the 'struct luo_file' instance itself. + * + * After successfully finishing all individual files, it frees the + * contiguous memory block that was used to transfer the serialized metadata + * from the previous kernel. + * + * Error Handling (Atomic Failure): + * This operation is atomic. If any handler's .can_finish() op fails, the entire + * function aborts immediately and returns an error. + * + * Context: Can be called from an ioctl handler in the new kernel. + * Return: 0 on success, or a negative errno on failure. 
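+ *
+ * A handler that must not be finished before its file has been picked up by
+ * the new workload can veto the operation through .can_finish() (sketch,
+ * names assumed):
+ *
+ *   static bool myfd_can_finish(struct liveupdate_file_op_args *args)
+ *   {
+ *           return args->retrieved;
+ *   }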
+ */ +int luo_file_finish(struct luo_file_set *file_set) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + int err; + + if (!file_set->count) + return 0; + + list_for_each_entry(luo_file, files_list, list) { + err = luo_file_can_finish_one(file_set, luo_file); + if (err) + return err; + } + + while (!list_empty(&file_set->files_list)) { + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + luo_file_finish_one(file_set, luo_file); + + if (luo_file->file) + fput(luo_file->file); + list_del(&luo_file->list); + file_set->count--; + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + if (file_set->files) { + kho_restore_free(file_set->files); + file_set->files = NULL; + } + + return 0; +} + +/** + * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel. + * @file_set: The incoming file_set to fill with deserialized data. + * @file_set_ser: Serialized KHO file_set data from the previous kernel. + * + * This function is called during the early boot process of the new kernel. It + * takes the raw, contiguous memory block of 'struct luo_file_ser' entries, + * provided by the previous kernel, and transforms it back into a live, + * in-memory linked list of 'struct luo_file' instances. + * + * For each serialized entry, it performs the following steps: + * 1. Reads the 'compatible' string. + * 2. Searches the global list of registered file handlers for one that + * matches the compatible string. + * 3. Allocates a new 'struct luo_file'. + * 4. Populates the new structure with the deserialized data (token, private + * data handle) and links it to the found handler. The 'file' pointer is + * initialized to NULL, as the file has not been retrieved yet. + * 5. Adds the new 'struct luo_file' to the file_set's files_list. + * + * This prepares the file_set for userspace, which can later call + * luo_retrieve_file() to restore the actual file descriptors. + * + * Context: Called from session deserialization. + */ +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser; + u64 i; + + if (!file_set_ser->files) { + WARN_ON(file_set_ser->count); + return 0; + } + + file_set->count = file_set_ser->count; + file_set->files = phys_to_virt(file_set_ser->files); + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of files that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + file_ser = file_set->files; + for (i = 0; i < file_set->count; i++) { + struct liveupdate_file_handler *fh; + bool handler_found = false; + struct luo_file *luo_file; + + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (!strcmp(fh->compatible, file_ser[i].compatible)) { + handler_found = true; + break; + } + } + + if (!handler_found) { + pr_warn("No registered handler for compatible '%s'\n", + file_ser[i].compatible); + return -ENOENT; + } + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) + return -ENOMEM; + + luo_file->fh = fh; + luo_file->file = NULL; + luo_file->serialized_data = file_ser[i].data; + luo_file->token = file_ser[i].token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + list_add_tail(&luo_file->list, &file_set->files_list); + } + + return 0; +} + +void luo_file_set_init(struct luo_file_set *file_set) +{ + INIT_LIST_HEAD(&file_set->files_list); +} + +void luo_file_set_destroy(struct luo_file_set *file_set) +{ + WARN_ON(file_set->count); + WARN_ON(!list_empty(&file_set->files_list)); +} + +/** + * liveupdate_register_file_handler - Register a file handler with LUO. + * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler. + * The caller must initialize this structure, including a unique + * 'compatible' string and a valid 'fh' callbacks. This function adds the + * handler to the global list of supported file handlers. + * + * Context: Typically called during module initialization for file types that + * support live update preservation. + * + * Return: 0 on success. Negative errno on failure. + */ +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + struct liveupdate_file_handler *fh_iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* Sanity check that all required callbacks are set */ + if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve || + !fh->ops->finish || !fh->ops->can_preserve) { + return -EINVAL; + } + + /* + * Ensure the system is quiescent (no active sessions). + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check for duplicate compatible strings */ + luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + if (!strcmp(fh_iter->compatible, fh->compatible)) { + pr_err("File handler registration failed: Compatible string '%s' already registered.\n", + fh->compatible); + err = -EEXIST; + goto err_resume; + } + } + + /* Pin the module implementing the handler */ + if (!try_module_get(fh->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); + list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_file_handler - Unregister a liveupdate file handler + * @fh: The file handler to unregister + * + * Unregisters the file handler from the liveupdate core. This function + * reverses the operations of liveupdate_register_file_handler(). + * + * It ensures safe removal by checking that: + * No live update session is currently in progress. + * + * If the unregistration fails, the internal test state is reverted. + * + * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live + * update is in progress, can't quiesce live update. 
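+ *
+ * Typical pairing in a handler module (sketch; registration pins the module
+ * through ops->owner until the handler is unregistered):
+ *
+ *   static int __init myfd_luo_init(void)
+ *   {
+ *           return liveupdate_register_file_handler(&myfd_handler);
+ *   }
+ *   module_init(myfd_luo_init);
+ *
+ *   static void __exit myfd_luo_exit(void)
+ *   {
+ *           liveupdate_unregister_file_handler(&myfd_handler);
+ *   }
+ *   module_exit(myfd_luo_exit);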
+ */ +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!luo_session_quiesce()) + return -EBUSY; + + list_del(&ACCESS_PRIVATE(fh, list)); + module_put(fh->ops->owner); + luo_session_resume(); + + return 0; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h new file mode 100644 index 000000000000..c8973b543d1d --- /dev/null +++ b/kernel/liveupdate/luo_internal.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#ifndef _LINUX_LUO_INTERNAL_H +#define _LINUX_LUO_INTERNAL_H + +#include <linux/liveupdate.h> +#include <linux/uaccess.h> + +struct luo_ucmd { + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, + size_t kernel_cmd_size) +{ + /* + * Copy the minimum of what the user provided and what we actually + * have. + */ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, kernel_cmd_size))) { + return -EFAULT; + } + return 0; +} + +/* + * Handles a deserialization failure: devices and memory is in unpredictable + * state. + * + * Continuing the boot process after a failure is dangerous because it could + * lead to leaks of private data. + */ +#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) + +/* Mimics list_for_each_entry() but for private list head entries */ +#define luo_list_for_each_private(pos, head, member) \ + for (struct list_head *__iter = (head)->next; \ + __iter != (head) && \ + ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ + __iter = __iter->next) + +/** + * struct luo_file_set - A set of files that belong to the same sessions. + * @files_list: An ordered list of files associated with this session, it is + * ordered by preservation time. + * @files: The physically contiguous memory block that holds the serialized + * state of files. + * @count: A counter tracking the number of files currently stored in the + * @files_list for this session. + */ +struct luo_file_set { + struct list_head files_list; + struct luo_file_ser *files; + long count; +}; + +/** + * struct luo_session - Represents an active or incoming Live Update session. + * @name: A unique name for this session, used for identification and + * retrieval. + * @ser: Pointer to the serialized data for this session. + * @list: A list_head member used to link this session into a global list + * of either outgoing (to be preserved) or incoming (restored from + * previous kernel) sessions. + * @retrieved: A boolean flag indicating whether this session has been + * retrieved by a consumer in the new kernel. + * @file_set: A set of files that belong to this session. + * @mutex: protects fields in the luo_session. 
+ */ +struct luo_session { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_session_ser *ser; + struct list_head list; + bool retrieved; + struct luo_file_set file_set; + struct mutex mutex; +}; + +int luo_session_create(const char *name, struct file **filep); +int luo_session_retrieve(const char *name, struct file **filep); +int __init luo_session_setup_outgoing(void *fdt); +int __init luo_session_setup_incoming(void *fdt); +int luo_session_serialize(void); +int luo_session_deserialize(void); +bool luo_session_quiesce(void); +void luo_session_resume(void); + +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); +void luo_file_unpreserve_files(struct luo_file_set *file_set); +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep); +int luo_file_finish(struct luo_file_set *file_set); +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_set_init(struct luo_file_set *file_set); +void luo_file_set_destroy(struct luo_file_set *file_set); + +#endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c new file mode 100644 index 000000000000..dbdbc3bd7929 --- /dev/null +++ b/kernel/liveupdate/luo_session.c @@ -0,0 +1,646 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO Sessions + * + * LUO Sessions provide the core mechanism for grouping and managing `struct + * file *` instances that need to be preserved across a kexec-based live + * update. Each session acts as a named container for a set of file objects, + * allowing a userspace agent to manage the lifecycle of resources critical to a + * workload. + * + * Core Concepts: + * + * - Named Containers: Sessions are identified by a unique, user-provided name, + * which is used for both creation in the current kernel and retrieval in the + * next kernel. + * + * - Userspace Interface: Session management is driven from userspace via + * ioctls on /dev/liveupdate. + * + * - Serialization: Session metadata is preserved using the KHO framework. When + * a live update is triggered via kexec, an array of `struct luo_session_ser` + * is populated and placed in a preserved memory region. An FDT node is also + * created, containing the count of sessions and the physical address of this + * array. + * + * Session Lifecycle: + * + * 1. Creation: A userspace agent calls `luo_session_create()` to create a + * new, empty session and receives a file descriptor for it. + * + * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is + * made, `luo_session_serialize()` is called. It iterates through all + * active sessions and writes their metadata into a memory area preserved + * by KHO. + * + * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()` + * runs, reading the serialized data and creating a list of `struct + * luo_session` objects representing the preserved sessions. + * + * 4. Retrieval: A userspace agent in the new kernel can then call + * `luo_session_retrieve()` with a session name to get a new file + * descriptor and access the preserved state. 
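+ *
+ * From userspace, the flow above looks roughly like the following sketch.
+ * The per-session ioctl command names and the exact uapi struct layouts
+ * (including the leading size field) live in <uapi/linux/liveupdate.h> and
+ * are assumed here:
+ *
+ *   int luo = open("/dev/liveupdate", O_RDWR);
+ *
+ *   struct liveupdate_ioctl_create_session c = {
+ *           .size = sizeof(c),
+ *           .name = "db-workload",
+ *   };
+ *   ioctl(luo, LIVEUPDATE_IOCTL_CREATE_SESSION, &c);
+ *
+ *   struct liveupdate_session_preserve_fd p = {
+ *           .size  = sizeof(p),
+ *           .token = 0x1001,
+ *           .fd    = memfd,
+ *   };
+ *   ioctl(c.fd, LIVEUPDATE_SESSION_PRESERVE_FD, &p);
+ *
+ *   reboot(LINUX_REBOOT_CMD_KEXEC);
+ *
+ * After kexec, the agent retrieves the session by name with
+ * LIVEUPDATE_IOCTL_RETRIEVE_SESSION, restores each file by token with
+ * LIVEUPDATE_SESSION_RETRIEVE_FD, and finally issues
+ * LIVEUPDATE_SESSION_FINISH once everything it needs has been recovered.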
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/anon_inodes.h> +#include <linux/cleanup.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/liveupdate.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/unaligned.h> +#include <uapi/linux/liveupdate.h> +#include "luo_internal.h" + +/* 16 4K pages, give space for 744 sessions */ +#define LUO_SESSION_PGCNT 16ul +#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_session_header_ser)) / \ + sizeof(struct luo_session_ser)) + +/** + * struct luo_session_header - Header struct for managing LUO sessions. + * @count: The number of sessions currently tracked in the @list. + * @list: The head of the linked list of `struct luo_session` instances. + * @rwsem: A read-write semaphore providing synchronized access to the + * session list and other fields in this structure. + * @header_ser: The header data of serialization array. + * @ser: The serialized session data (an array of + * `struct luo_session_ser`). + * @active: Set to true when first initialized. If previous kernel did not + * send session data, active stays false for incoming. + */ +struct luo_session_header { + long count; + struct list_head list; + struct rw_semaphore rwsem; + struct luo_session_header_ser *header_ser; + struct luo_session_ser *ser; + bool active; +}; + +/** + * struct luo_session_global - Global container for managing LUO sessions. + * @incoming: The sessions passed from the previous kernel. + * @outgoing: The sessions that are going to be passed to the next kernel. + */ +struct luo_session_global { + struct luo_session_header incoming; + struct luo_session_header outgoing; +}; + +static struct luo_session_global luo_session_global = { + .incoming = { + .list = LIST_HEAD_INIT(luo_session_global.incoming.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem), + }, + .outgoing = { + .list = LIST_HEAD_INIT(luo_session_global.outgoing.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem), + }, +}; + +static struct luo_session *luo_session_alloc(const char *name) +{ + struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL); + + if (!session) + return ERR_PTR(-ENOMEM); + + strscpy(session->name, name, sizeof(session->name)); + INIT_LIST_HEAD(&session->file_set.files_list); + luo_file_set_init(&session->file_set); + INIT_LIST_HEAD(&session->list); + mutex_init(&session->mutex); + + return session; +} + +static void luo_session_free(struct luo_session *session) +{ + luo_file_set_destroy(&session->file_set); + mutex_destroy(&session->mutex); + kfree(session); +} + +static int luo_session_insert(struct luo_session_header *sh, + struct luo_session *session) +{ + struct luo_session *it; + + guard(rwsem_write)(&sh->rwsem); + + /* + * For outgoing we should make sure there is room in serialization array + * for new session. + */ + if (sh == &luo_session_global.outgoing) { + if (sh->count == LUO_SESSION_MAX) + return -ENOMEM; + } + + /* + * For small number of sessions this loop won't hurt performance + * but if we ever start using a lot of sessions, this might + * become a bottle neck during deserialization time, as it would + * cause O(n*n) complexity. 
+ */ + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, session->name, sizeof(it->name))) + return -EEXIST; + } + list_add_tail(&session->list, &sh->list); + sh->count++; + + return 0; +} + +static void luo_session_remove(struct luo_session_header *sh, + struct luo_session *session) +{ + guard(rwsem_write)(&sh->rwsem); + list_del(&session->list); + sh->count--; +} + +static int luo_session_finish_one(struct luo_session *session) +{ + guard(mutex)(&session->mutex); + return luo_file_finish(&session->file_set); +} + +static void luo_session_unfreeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + luo_file_unfreeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_freeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + return luo_file_freeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_release(struct inode *inodep, struct file *filep) +{ + struct luo_session *session = filep->private_data; + struct luo_session_header *sh; + + /* If retrieved is set, it means this session is from incoming list */ + if (session->retrieved) { + int err = luo_session_finish_one(session); + + if (err) { + pr_warn("Unable to finish session [%s] on release\n", + session->name); + return err; + } + sh = &luo_session_global.incoming; + } else { + scoped_guard(mutex, &session->mutex) + luo_file_unpreserve_files(&session->file_set); + sh = &luo_session_global.outgoing; + } + + luo_session_remove(sh, session); + luo_session_free(session); + + return 0; +} + +static int luo_session_preserve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_preserve_fd *argp = ucmd->cmd; + int err; + + guard(mutex)(&session->mutex); + err = luo_preserve_file(&session->file_set, argp->token, argp->fd); + if (err) + return err; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + pr_warn("The file was successfully preserved, but response to user failed\n"); + + return err; +} + +static int luo_session_retrieve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_retrieve_fd *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + guard(mutex)(&session->mutex); + err = luo_retrieve_file(&session->file_set, argp->token, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_session_finish(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_finish *argp = ucmd->cmd; + int err = luo_session_finish_one(session); + + if (err) + return err; + + return luo_ucmd_respond(ucmd, sizeof(*argp)); +} + +union ucmd_buffer { + struct liveupdate_session_finish finish; + struct liveupdate_session_preserve_fd preserve; + struct liveupdate_session_retrieve_fd retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, 
_last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_session_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish, + struct liveupdate_session_finish, reserved), + IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd, + struct liveupdate_session_preserve_fd, token), + IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd, + struct liveupdate_session_retrieve_fd, token), +}; + +static long luo_session_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct luo_session *session = filep->private_data; + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >= + ARRAY_SIZE(luo_session_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + + return op->execute(session, &ucmd); +} + +static const struct file_operations luo_session_fops = { + .owner = THIS_MODULE, + .release = luo_session_release, + .unlocked_ioctl = luo_session_ioctl, +}; + +/* Create a "struct file" for session */ +static int luo_session_getfile(struct luo_session *session, struct file **filep) +{ + char name_buf[128]; + struct file *file; + + lockdep_assert_held(&session->mutex); + snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name); + file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR); + if (IS_ERR(file)) + return PTR_ERR(file); + + *filep = file; + + return 0; +} + +int luo_session_create(const char *name, struct file **filep) +{ + struct luo_session *session; + int err; + + session = luo_session_alloc(name); + if (IS_ERR(session)) + return PTR_ERR(session); + + err = luo_session_insert(&luo_session_global.outgoing, session); + if (err) + goto err_free; + + scoped_guard(mutex, &session->mutex) + err = luo_session_getfile(session, filep); + if (err) + goto err_remove; + + return 0; + +err_remove: + luo_session_remove(&luo_session_global.outgoing, session); +err_free: + luo_session_free(session); + + return err; +} + +int luo_session_retrieve(const char *name, struct file **filep) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + struct luo_session *session = NULL; + struct luo_session *it; + int err; + + scoped_guard(rwsem_read, &sh->rwsem) { + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, name, sizeof(it->name))) { + session = it; + break; + } + } + } + + if (!session) + return -ENOENT; + + guard(mutex)(&session->mutex); + if (session->retrieved) + return -EINVAL; + + err = luo_session_getfile(session, filep); + if (!err) + session->retrieved = true; + + return err; +} + +int __init luo_session_setup_outgoing(void *fdt_out) +{ + struct luo_session_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME); + err |= fdt_property_string(fdt_out, 
"compatible", + LUO_FDT_SESSION_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + luo_session_global.outgoing.header_ser = header_ser; + luo_session_global.outgoing.ser = (void *)(header_ser + 1); + luo_session_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + return err; +} + +int __init luo_session_setup_incoming(void *fdt_in) +{ + struct luo_session_header_ser *header_ser; + int err, header_size, offset; + u64 header_ser_pa; + const void *ptr; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get session node: [%s]\n", + LUO_FDT_SESSION_NODE_NAME); + return -EINVAL; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_SESSION_COMPATIBLE); + if (err) { + pr_err("Session node incompatible [%s]\n", + LUO_FDT_SESSION_COMPATIBLE); + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get session header '%s' [%d]\n", + LUO_FDT_SESSION_HEADER, header_size); + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_session_global.incoming.header_ser = header_ser; + luo_session_global.incoming.ser = (void *)(header_ser + 1); + luo_session_global.incoming.active = true; + + return 0; +} + +int luo_session_deserialize(void) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + static bool is_deserialized; + static int err; + + /* If has been deserialized, always return the same error code */ + if (is_deserialized) + return err; + + is_deserialized = true; + if (!sh->active) + return 0; + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of sessions that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + for (int i = 0; i < sh->header_ser->count; i++) { + struct luo_session *session; + + session = luo_session_alloc(sh->ser[i].name); + if (IS_ERR(session)) { + pr_warn("Failed to allocate session [%s] during deserialization %pe\n", + sh->ser[i].name, session); + return PTR_ERR(session); + } + + err = luo_session_insert(sh, session); + if (err) { + pr_warn("Failed to insert session [%s] %pe\n", + session->name, ERR_PTR(err)); + luo_session_free(session); + return err; + } + + scoped_guard(mutex, &session->mutex) { + luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } + } + + kho_restore_free(sh->header_ser); + sh->header_ser = NULL; + sh->ser = NULL; + + return 0; +} + +int luo_session_serialize(void) +{ + struct luo_session_header *sh = &luo_session_global.outgoing; + struct luo_session *session; + int i = 0; + int err; + + guard(rwsem_write)(&sh->rwsem); + list_for_each_entry(session, &sh->list, list) { + err = luo_session_freeze_one(session, &sh->ser[i]); + if (err) + goto err_undo; + + strscpy(sh->ser[i].name, session->name, + sizeof(sh->ser[i].name)); + i++; + } + sh->header_ser->count = sh->count; + + return 0; + +err_undo: + list_for_each_entry_continue_reverse(session, &sh->list, list) { + i--; + luo_session_unfreeze_one(session, &sh->ser[i]); + memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name)); + } + + return err; +} + +/** + * luo_session_quiesce - Ensure no active sessions exist and lock session lists. + * + * Acquires exclusive write locks on both incoming and outgoing session lists. + * It then validates no sessions exist in either list. + * + * This mechanism is used during file handler un/registration to ensure that no + * sessions are currently using the handler, and no new sessions can be created + * while un/registration is in progress. + * + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + * + * Return: + * true - System is quiescent (0 sessions) and locked. + * false - Active sessions exist. The locks are released internally. + */ +bool luo_session_quiesce(void) +{ + down_write(&luo_session_global.incoming.rwsem); + down_write(&luo_session_global.outgoing.rwsem); + + if (luo_session_global.incoming.count || + luo_session_global.outgoing.count) { + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); + return false; + } + + return true; +} + +/** + * luo_session_resume - Unlock session lists and resume normal activity. + * + * Releases the exclusive locks acquired by a successful call to + * luo_session_quiesce(). 
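For completeness, the way liveupdate_unregister_file_handler() earlier in this patch pairs these two helpers can be reduced to the following sketch (error handling trimmed, function name made up):

static int luo_modify_handlers(void)
{
	if (!luo_session_quiesce())
		return -EBUSY;  /* sessions exist; nothing was left locked */

	/* ... safely add or remove entries on the global file-handler list ... */

	luo_session_resume();
	return 0;
}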
+ */ +void luo_session_resume(void) +{ + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); +} diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 58d78a33ac65..2d4c5bab5af8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -219,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; +unsigned long nr_dynamic_keys; unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); @@ -296,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) dst->nr += src->nr; } -struct lock_class_stats lock_stats(struct lock_class *class) +void lock_stats(struct lock_class *class, struct lock_class_stats *stats) { - struct lock_class_stats stats; int cpu, i; - memset(&stats, 0, sizeof(struct lock_class_stats)); + memset(stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++) + stats->contention_point[i] += pcs->contention_point[i]; - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++) + stats->contending_point[i] += pcs->contending_point[i]; - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); + lock_time_add(&pcs->read_waittime, &stats->read_waittime); + lock_time_add(&pcs->write_waittime, &stats->write_waittime); - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + lock_time_add(&pcs->read_holdtime, &stats->read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats->write_holdtime); - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; + for (i = 0; i < ARRAY_SIZE(stats->bounces); i++) + stats->bounces[i] += pcs->bounces[i]; } - - return stats; } void clear_lock_stats(struct lock_class *class) @@ -1238,6 +1236,7 @@ void lockdep_register_key(struct lock_class_key *key) goto out_unlock; } hlist_add_head_rcu(&key->hash_entry, hash_head); + nr_dynamic_keys++; out_unlock: graph_unlock(); restore_irqs: @@ -1977,41 +1976,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, } /* - * We are about to add A -> B into the dependency graph, and in __bfs() a - * strong dependency path A -> .. -> B is found: hlock_class equals - * entry->class. - * - * If A -> .. -> B can replace A -> B in any __bfs() search (means the former - * is _stronger_ than or equal to the latter), we consider A -> B as redundant. - * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A - * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the - * dependency graph, as any strong path ..-> A -> B ->.. we can get with - * having dependency A -> B, we could already get a equivalent path ..-> A -> - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. - * - * We need to make sure both the start and the end of A -> .. -> B is not - * weaker than A -> B. 
For the start part, please see the comment in - * check_redundant(). For the end part, we need: - * - * Either - * - * a) A -> B is -(*R)-> (everything is not weaker than that) - * - * or - * - * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) - * - */ -static inline bool hlock_equal(struct lock_list *entry, void *data) -{ - struct held_lock *hlock = (struct held_lock *)data; - - return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ - (hlock->read == 2 || /* A -> B is -(*R)-> */ - !entry->only_xr); /* A -> .. -> B is -(*N)-> */ -} - -/* * We are about to add B -> A into the dependency graph, and in __bfs() a * strong dependency path A -> .. -> B is found: hlock_class equals * entry->class. @@ -2916,6 +2880,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) #ifdef CONFIG_LOCKDEP_SMALL /* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* * Check that the dependency graph starting at <src> can lead to * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. @@ -5101,6 +5100,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockevent_inc(lockdep_nocheck); } + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) + return 0; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; /* @@ -6606,6 +6608,7 @@ void lockdep_unregister_key(struct lock_class_key *key) pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); need_callback = prepare_call_rcu_zapped(pf); + nr_dynamic_keys--; } lockdep_unlock(); raw_local_irq_restore(flags); @@ -6613,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key) if (need_callback) call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ - synchronize_rcu(); + /* + * Wait until is_dynamic_key() has finished accessing k->hash_entry. + * + * Some operations like __qdisc_destroy() will call this in a debug + * kernel, and the network traffic is disabled while waiting, hence + * the delay of the wait matters in debugging cases. Currently use a + * synchronize_rcu_expedited() to speed up the wait at the cost of + * system IPIs. 
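For context on why the unregister latency matters: objects allocated at run time register their own lock_class_key (now counted by nr_dynamic_keys) and tear it down on free, which funnels through the grace-period wait above. A minimal sketch of such a user, with a made-up struct foo:

#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	struct lock_class_key key;      /* dynamic key, tracked by lockdep */
};

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	lockdep_register_key(&f->key);
	spin_lock_init(&f->lock);
	lockdep_set_class(&f->lock, &f->key);
	return f;
}

static void foo_free(struct foo *f)
{
	/* May block: lockdep_unregister_key() waits out an RCU grace period. */
	lockdep_unregister_key(&f->key);
	kfree(f);
}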
TODO: Replace RCU with hazptr for this. + */ + synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(lockdep_unregister_key); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 20f9ef58d3d0..0e5e6ffe91a3 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -47,29 +47,31 @@ enum { __LOCKF(USED_READ) }; +enum { #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -static const unsigned long LOCKF_ENABLED_IRQ = + LOCKF_ENABLED_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | -static const unsigned long LOCKF_USED_IN_IRQ = + LOCKF_USED_IN_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | -static const unsigned long LOCKF_ENABLED_IRQ_READ = + LOCKF_ENABLED_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | -static const unsigned long LOCKF_USED_IN_IRQ_READ = + LOCKF_USED_IN_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE +}; #define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) #define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) @@ -138,6 +140,7 @@ extern unsigned long nr_lock_classes; extern unsigned long nr_zapped_classes; extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; +extern unsigned long nr_dynamic_keys; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6db0f43fc4df..1916db9aa46b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " dynamic-keys: %11lu\n", + nr_dynamic_keys); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", nr_list_entries, MAX_LOCKDEP_ENTRIES); seq_printf(m, " indirect dependencies: %11lu\n", @@ -655,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!test_bit(idx, lock_classes_in_use)) continue; iter->class = class; - iter->stats = lock_stats(class); + lock_stats(class, &iter->stats); iter++; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ce0362f0a871..6567e5eeacc0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -103,8 +103,8 @@ static const struct kernel_param_ops lt_bind_ops = { .get = param_get_cpumask, }; -module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); -module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0444); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0444); long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); @@ -1211,6 +1211,10 @@ end: cxt.cur_ops->exit(); cxt.init_called = false; } + + free_cpumask_var(bind_readers); + free_cpumask_var(bind_writers); + torture_cleanup_end(); } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 6e6f6071cfa2..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, { 
lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) */ + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { + struct mutex *blocked_on = __get_task_blocked_on(task); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; @@ -77,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 555e2b3a665a..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/hung_task.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -42,8 +43,7 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); @@ -51,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -141,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. 
Make sure to @@ -165,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { @@ -190,9 +207,7 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, lock); -#endif + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -208,9 +223,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, NULL); -#endif + hung_task_clear_blocker(); } /* @@ -643,6 +656,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -679,6 +693,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas first = __mutex_waiter_is_first(lock, &waiter); + /* + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to acquire. + */ + set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -690,8 +710,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + /* + * mutex_optimistic_spin() can call schedule(), so + * clear blocked on so we don't become unselectable + * to run. 
+ */ + clear_task_blocked_on(current, lock); if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; + set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } @@ -699,6 +726,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -728,9 +756,11 @@ skip_wait: return 0; err: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + WARN_ON(__get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -808,11 +838,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +_mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest) { - return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) @@ -940,6 +971,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + __clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } @@ -1062,6 +1094,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -1078,17 +1111,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { + MUTEX_WARN_ON(lock->magic != lock); + return __mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock); +#else +int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) +{ bool locked; MUTEX_WARN_ON(lock->magic != lock); - locked = __mutex_trylock(lock); if (locked) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); return locked; } -EXPORT_SYMBOL(mutex_trylock); +EXPORT_SYMBOL(_mutex_trylock_nest_lock); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index cbff35b9b7ae..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -6,7 +6,7 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ - +#ifndef CONFIG_PREEMPT_RT /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,5 +67,6 @@ extern void 
debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index d6964fc29f51..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 191e4720e546..59dbd29cb219 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -13,6 +13,24 @@ */ int max_lock_depth = 1024; +static const struct ctl_table rtmutex_sysctl_table[] = { + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_rtmutex_sysctl(void) +{ + register_sysctl_init("kernel", rtmutex_sysctl_table); + return 0; +} + +subsys_initcall(init_rtmutex_sysctl); + /* * Debug aware fast / slowpath lock,trylock,unlock * @@ -497,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, @@ -524,6 +540,13 @@ static 
__always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -544,12 +567,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -int __sched mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass) +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) { @@ -563,8 +586,29 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); @@ -591,22 +635,16 @@ void __sched mutex_lock_io(struct mutex *lock) io_schedule_finish(token); } EXPORT_SYMBOL(mutex_lock_io); -#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ int __sched mutex_trylock(struct mutex *lock) { - int ret; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(&lock->rtmutex); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; + return __rt_mutex_trylock(&lock->rtmutex); } EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) { diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. 
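The rwsem changes that follow apply the same blocker annotation already used for mutexes and semaphores in this series; the shared shape is roughly the sketch below, where struct foo_lock and foo_try_acquire() are stand-ins for a real lock type:

#include <linux/hung_task.h>
#include <linux/sched.h>

struct foo_lock;                                /* stand-in lock type */
static bool foo_try_acquire(struct foo_lock *lock);

static void foo_lock_slowpath(struct foo_lock *lock)
{
	/* Tell khungtaskd what this task is about to sleep on, and how. */
	hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);

	set_current_state(TASK_UNINTERRUPTIBLE);
	while (!foo_try_acquire(lock)) {
		schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	hung_task_clear_blocker();
}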
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..24df4d98f7d2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> #include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT @@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { @@ -727,8 +728,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return ret; } -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) - static inline enum owner_state rwsem_owner_state(struct task_struct *owner, unsigned long flags) { @@ -835,7 +834,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) enum owner_state owner_state; owner_state = rwsem_spin_on_owner(sem); - if (!(owner_state & OWNER_SPINNABLE)) + if (owner_state == OWNER_NONSPINNABLE) break; /* @@ -1065,10 +1064,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). 
*/ break; @@ -1083,8 +1085,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1146,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1179,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index de9117c0e671..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -34,6 +34,7 @@ #include <linux/spinlock.h> #include <linux/ftrace.h> #include <trace/events/lock.h> +#include <linux/hung_task.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} + /** * down - acquire the semaphore * @sem: the semaphore to be acquired @@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, 
flags); return (count < 0); @@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem) DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else @@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 37f025a096c9..31a785afee6c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -284,6 +284,12 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif + /* + * When waking up the task to die, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + __clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -331,9 +337,19 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ - if (owner != current) + if (owner != current) { + /* + * When waking up the task to wound, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. + */ + __clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); - + } return true; } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index d7762ef5949a..2a1beebf1d37 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -192,6 +192,11 @@ config GENDWARFKSYMS depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT # Requires ELF object files. 
depends on !LTO + # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on + # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop + # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder: + # Verify 0 address DWARF variables are in ELF section"). + depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129 help Calculate symbol versions from DWARF debugging information using gendwarfksyms. Requires DEBUG_INFO to be enabled. @@ -455,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 626cf8668a7e..618202578b42 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const u32 __start___kcrctab[]; extern const u32 __start___kcrctab_gpl[]; +#define KMOD_PATH_LEN 256 +extern char modprobe_path[]; + struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ @@ -109,6 +112,13 @@ struct find_symbol_arg { enum mod_license license; }; +/* modules using other modules */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + int mod_verify_sig(const void *mod, struct load_info *info); int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); @@ -322,8 +332,11 @@ int module_enable_rodata_ro(const struct module *mod); int module_enable_rodata_ro_after_init(const struct module *mod); int module_enable_data_nx(const struct module *mod); int module_enable_text_rox(const struct module *mod); -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod); +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod); +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/main.c b/kernel/module/main.c index a2859dc3eea6..710ee30b3bea 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod) } /* Block module loading/unloading? */ -int modules_disabled; +static int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); +static const struct ctl_table module_sysctl_table[] = { + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init init_module_sysctl(void) +{ + register_sysctl_init("kernel", module_sysctl_table); + return 0; +} + +subsys_initcall(init_module_sysctl); + /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -170,6 +198,30 @@ static inline void add_taint_module(struct module *mod, unsigned flag, } /* + * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule. 
+ */ +static int mod_strncmp(const char *str_a, const char *str_b, size_t n) +{ + for (int i = 0; i < n; i++) { + char a = str_a[i]; + char b = str_b[i]; + int d; + + if (a == '-') a = '_'; + if (b == '-') b = '_'; + + d = a - b; + if (d) + return d; + + if (!a) + break; + } + + return 0; +} + +/* * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. */ @@ -556,7 +608,7 @@ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); static struct { - char name[MODULE_NAME_LEN + 1]; + char name[MODULE_NAME_LEN]; char taints[MODULE_FLAGS_BUF_SIZE]; } last_unloaded_module; @@ -727,14 +779,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, struct module *mod; char name[MODULE_NAME_LEN]; char buf[MODULE_FLAGS_BUF_SIZE]; - int ret, forced = 0; + int ret, len, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; + len = strncpy_from_user(name, name_user, MODULE_NAME_LEN); + if (len == 0 || len == MODULE_NAME_LEN) + return -ENOENT; + if (len < 0) + return len; audit_log_kern_module(name); @@ -900,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } @@ -1083,6 +1137,46 @@ static char *get_modinfo(const struct load_info *info, const char *tag) return get_next_modinfo(info, tag, NULL); } +/** + * verify_module_namespace() - does @modname have access to this symbol's @namespace + * @namespace: export symbol namespace + * @modname: module name + * + * If @namespace is prefixed with "module:" to indicate it is a module namespace + * then test if @modname matches any of the comma separated patterns. + * + * The patterns only support tail-glob. 
+ */ +static bool verify_module_namespace(const char *namespace, const char *modname) +{ + size_t len, modlen = strlen(modname); + const char *prefix = "module:"; + const char *sep; + bool glob; + + if (!strstarts(namespace, prefix)) + return false; + + for (namespace += strlen(prefix); *namespace; namespace = sep) { + sep = strchrnul(namespace, ','); + len = sep - namespace; + + glob = false; + if (sep[-1] == '*') { + len--; + glob = true; + } + + if (*sep) + sep++; + + if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen)) + return true; + } + + return false; +} + static int verify_namespace_is_imported(const struct load_info *info, const struct kernel_symbol *sym, struct module *mod) @@ -1092,6 +1186,10 @@ static int verify_namespace_is_imported(const struct load_info *info, namespace = kernel_symbol_namespace(sym); if (namespace && namespace[0]) { + + if (verify_module_namespace(namespace, mod->name)) + return 0; + for_each_modinfo_entry(imported_namespace, info, "import_ns") { if (strcmp(namespace, imported_namespace) == 0) return 0; @@ -1224,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module @@ -1505,8 +1594,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (infosec >= info->hdr->e_shnum) continue; - /* Don't bother with non-allocated sections */ - if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + /* + * Don't bother with non-allocated sections. + * An exception is the percpu section, which has separate allocations + * for individual CPUs. We relocate the percpu section in the initial + * ELF template and subsequently copy it to the per-CPU destinations. + */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) && + (!infosec || infosec != info->index.pcpu)) continue; if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) @@ -1562,12 +1657,11 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i { unsigned int m, i; + /* + * { Mask of required section header flags, + * Mask of excluded section header flags } + */ static const unsigned long masks[][2] = { - /* - * NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below - */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, @@ -1659,15 +1753,30 @@ static void module_license_taint_check(struct module *mod, const char *license) } } -static void setup_modinfo(struct module *mod, struct load_info *info) +static int setup_modinfo(struct module *mod, struct load_info *info) { const struct module_attribute *attr; + char *imported_namespace; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) attr->setup(mod, get_modinfo(info, attr->attr.name)); } + + for_each_modinfo_entry(imported_namespace, info, "import_ns") { + /* + * 'module:' prefixed namespaces are implicit, disallow + * explicit imports. 
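Connecting the check above to the export side: with the "module:" namespace form, an exporter can restrict a symbol to named modules (tail-glob allowed) without every consumer adding MODULE_IMPORT_NS(). The symbol and module names below are invented for illustration:

#include <linux/export.h>
#include <linux/module.h>

int mycore_do_thing(void)
{
	return 0;
}
/* Resolvable only by "mydrv" and modules matching "mydrv_*"; dashes and
 * underscores in the module name compare equal via mod_strncmp() above. */
EXPORT_SYMBOL_NS_GPL(mycore_do_thing, "module:mydrv,mydrv_*");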
+ */ + if (strstarts(imported_namespace, "module:")) { + pr_err("%s: module tries to import module namespace: %s\n", + mod->name, imported_namespace); + return -EPERM; + } + } + + return 0; } static void free_modinfo(struct module *mod) @@ -2557,7 +2666,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); #endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD +#ifdef CONFIG_DYNAMIC_FTRACE /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), @@ -2614,9 +2723,8 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { - int i; - enum mod_mem_type t = 0; - int ret = -ENOMEM; + int i, ret; + enum mod_mem_type t = MOD_MEM_NUM_TYPES; bool codetag_section_found = false; for_each_mod_mem_type(type) { @@ -2694,7 +2802,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_err: module_memory_restore_rox(mod); - for (t--; t >= 0; t--) + while (t--) module_memory_free(mod, t); if (codetag_section_found) codetag_free_module_sections(mod); @@ -2768,7 +2876,6 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { struct module *mod; - unsigned int ndx; int err; /* Allow arches to frob section contents and sizes. */ @@ -2786,22 +2893,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* - * Mark ro_after_init section with SHF_RO_AFTER_INIT so that - * layout_sections() can put it in the right place. + * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can + * put them in the right place. * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. */ - ndx = find_sec(info, ".data..ro_after_init"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - /* - * Mark the __jump_table section as ro_after_init as well: these data - * structures are never modified, with the exception of entries that - * refer to code in the __init section, which are annotated as such - * at module load time. - */ - ndx = find_sec(info, "__jump_table"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings); /* * Determine total sizes, and put offsets in sh_entsize. For now @@ -2829,6 +2925,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); + codetag_free_module_sections(mod); free_mod_mem(mod); } @@ -3297,7 +3394,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_allocated = true; - audit_log_kern_module(mod->name); + audit_log_kern_module(info->name); /* Reserve our place in the list. */ err = add_unformed_module(mod); @@ -3335,7 +3432,9 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, info); + err = setup_modinfo(mod, info); + if (err) + goto free_modinfo; /* Fix up syms, so that st_value is a pointer to location. 
*/ err = simplify_symbols(mod, info); @@ -3398,11 +3497,12 @@ static int load_module(struct load_info *info, const char __user *uargs, goto sysfs_cleanup; } + if (codetag_load_module(mod)) + goto sysfs_cleanup; + /* Get rid of temporary copy. */ free_copy(info, flags); - codetag_load_module(mod); - /* Done! */ trace_module_load(mod); @@ -3458,8 +3558,10 @@ static int load_module(struct load_info *info, const char __user *uargs, * failures once the proper module was allocated and * before that. */ - if (!module_allocated) + if (!module_allocated) { + audit_log_kern_module(info->name ? info->name : "?"); mod_stat_bump_becoming(info, flags); + } free_copy(info, flags); return err; } @@ -3573,24 +3675,35 @@ static int idempotent_wait_for_completion(struct idempotent *u) static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { + bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE); struct load_info info = { }; void *buf = NULL; int len; + int err; - len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); + len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, + compressed ? READING_MODULE_COMPRESSED : + READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); return len; } - if (flags & MODULE_INIT_COMPRESSED_FILE) { - int err = module_decompress(&info, buf, len); + if (compressed) { + err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ if (err) { mod_stat_inc(&failed_decompress); mod_stat_add_long(len, &invalid_decompress_bytes); return err; } + err = security_kernel_post_read_file(f, (char *)info.hdr, info.len, + READING_MODULE); + if (err) { + mod_stat_inc(&failed_kreads); + free_copy(&info, flags); + return err; + } } else { info.hdr = buf; info.len = len; diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 03f4142cfbf4..8fd438529fbc 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -87,8 +87,9 @@ int module_enable_data_nx(const struct module *mod) return 0; } -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod) { const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; int i; @@ -106,3 +107,45 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, return 0; } + +static const char *const ro_after_init[] = { + /* + * Section .data..ro_after_init holds data explicitly annotated by + * __ro_after_init. + */ + ".data..ro_after_init", + + /* + * Section __jump_table holds data structures that are never modified, + * with the exception of entries that refer to code in the __init + * section, which are marked as such at module load time. + */ + "__jump_table", + +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + /* + * Section .static_call_sites holds data structures that need to be + * sorted and processed at module load time but are never modified + * afterwards. 
+ */ + ".static_call_sites", +#endif +}; + +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings) +{ + int i, j; + + for (i = 1; i < hdr->e_shnum; i++) { + Elf_Shdr *shdr = &sechdrs[i]; + + for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) { + if (strcmp(secstrings + shdr->sh_name, + ro_after_init[j]) == 0) { + shdr->sh_flags |= SHF_RO_AFTER_INIT; + break; + } + } + } +} diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index b401ff4b02d2..c7622ff5226a 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -56,9 +56,9 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) { const struct bin_attribute *const *bin_attr; - for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++) + for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++) kfree((*bin_attr)->attr.name); - kfree(sect_attrs->grp.bin_attrs_new); + kfree(sect_attrs->grp.bin_attrs); kfree(sect_attrs); } @@ -86,7 +86,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) /* Setup section attributes. */ sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs_new = gattr; + sect_attrs->grp.bin_attrs = gattr; sattr = §_attrs->attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { @@ -101,7 +101,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) ret = -ENOMEM; goto out; } - sattr->read_new = module_sect_read; + sattr->read = module_sect_read; sattr->private = (void *)sec->sh_addr; sattr->size = MODULE_SECT_READ_SIZE; sattr->attr.mode = 0400; @@ -144,7 +144,7 @@ struct module_notes_attrs { static void free_notes_attrs(struct module_notes_attrs *notes_attrs) { - kfree(notes_attrs->grp.bin_attrs_new); + kfree(notes_attrs->grp.bin_attrs); kfree(notes_attrs); } @@ -178,7 +178,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) } notes_attrs->grp.name = "notes"; - notes_attrs->grp.bin_attrs_new = gattr; + notes_attrs->grp.bin_attrs = gattr; nattr = ¬es_attrs->attrs[0]; for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { @@ -190,7 +190,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read_new = sysfs_bin_attr_simple_read; + nattr->read = sysfs_bin_attr_simple_read; *(gattr++) = nattr++; } ++loaded; diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb..f8e8c126705c 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. 
*/ diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..bdc3c86231d3 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ + +#include <linux/ns_common.h> +#include <linux/nstree.h> +#include <linux/proc_ns.h> +#include <linux/user_namespace.h> +#include <linux/vfsdebug.h> + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + } +} +#endif + +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) +{ + int ret = 0; + + refcount_set(&ns->__ns_ref, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + ns->ns_type = ns_type; + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); + +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + + if (inum) + ns->inum = inum; + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; +} + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. 
+ * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. 
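A minimal user-space model of the cascading active count described in this comment: each non-initial namespace pins its owner with one active reference, a put that reaches zero cascades one level up, and a get that resurrects a dead namespace re-pins owners until it hits one that was still active. The struct layout, names and starting counts below are invented for illustration, and the "initial namespaces are always active" special case is left out.

#include <stdatomic.h>
#include <stdio.h>

struct ns {
	const char *name;
	struct ns *owner;	/* NULL stands in for the initial user namespace */
	atomic_int active;
};

static void ns_active_put(struct ns *ns)
{
	/* Dropping the last active reference cascades one level up. */
	while (ns && atomic_fetch_sub(&ns->active, 1) == 1)
		ns = ns->owner;
}

static void ns_active_get(struct ns *ns)
{
	/* Resurrecting a dead namespace re-pins its owners upwards. */
	while (ns && atomic_fetch_add(&ns->active, 1) == 0)
		ns = ns->owner;
}

int main(void)
{
	struct ns user2 = { "user_ns2", NULL,   1 };	/* pinned only by user_ns1 here */
	struct ns user1 = { "user_ns1", &user2, 2 };	/* pinned by net_ns and pid_ns */
	struct ns net   = { "net_ns",   &user1, 1 };
	struct ns pid   = { "pid_ns",   &user1, 1 };

	ns_active_put(&net);
	ns_active_put(&pid);	/* cascades: user_ns1 -> 0, then user_ns2 -> 0 */
	printf("user_ns1=%d user_ns2=%d\n",
	       atomic_load(&user1.active), atomic_load(&user2.active));

	ns_active_get(&net);	/* resurrect: net_ns, user_ns1, user_ns2 back to 1 */
	printf("net=%d user_ns1=%d user_ns2=%d\n",
	       atomic_load(&net.active), atomic_load(&user1.active),
	       atomic_load(&user2.active));
	return 0;
}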
+ */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c9d97ed20122..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/cgroup.h> #include <linux/perf_event.h> +#include <linux/nstree.h> static struct kmem_cache *nsproxy_cachep; @@ -59,12 +60,31 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -128,17 +148,13 @@ out_time: out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: - if (new_nsp->pid_ns_for_children) - put_pid_ns(new_nsp->pid_ns_for_children); + put_pid_ns(new_nsp->pid_ns_for_children); out_pid: - if (new_nsp->ipc_ns) - put_ipc_ns(new_nsp->ipc_ns); + put_ipc_ns(new_nsp->ipc_ns); out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); + put_uts_ns(new_nsp->uts_ns); out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); + put_mnt_ns(new_nsp->mnt_ns); out_ns: kmem_cache_free(nsproxy_cachep, new_nsp); return ERR_PTR(err); @@ -148,7 +164,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); @@ -183,29 +199,11 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns_for_children) - put_pid_ns(ns->pid_ns_for_children); - if (ns->time_ns) - put_time_ns(ns->time_ns); - if (ns->time_ns_for_children) - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. 
Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -242,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -251,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; @@ -325,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) @@ -555,9 +572,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..f36c59e6951d --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ + +#include <linux/nstree.h> +#include <linux/proc_ns.h> +#include <linux/rculist.h> +#include <linux/vfsdebug.h> +#include <linux/syscalls.h> +#include <linux/user_namespace.h> + +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); + +DEFINE_LOCK_GUARD_0(ns_tree_writer, + write_seqlock(&ns_tree_lock), + write_sequnlock(&ns_tree_lock)) + +DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, + read_seqlock_excl(&ns_tree_lock), + read_sequnlock_excl(&ns_tree_lock)) + +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), +}; + +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), +}; + +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), +}; + +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), +}; + +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), +}; + +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), +}; + +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), +}; + +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = 
LIST_HEAD_INIT(time_ns_tree.ns_list_head), +}; + +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. + * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. 
+ */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); +} + +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); +} + +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); +} + +static int ns_id_cmp(u64 id_a, u64 id_b) +{ + if (id_a < id_b) + return -1; + if (id_a > id_b) + return 1; + return 0; +} + +static int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); +} + +static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); +} + +static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + struct rb_node *node; + const struct proc_ns_operations *ops = ns->ops; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + guard(ns_tree_writer)(); + + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); + + /* Add to unified tree and list */ + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + + /* Add to owner's tree if applicable */ + if (ops) { + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); + } else { + /* Only the initial user namespace doesn't have an owner. 
*/ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); + + write_seqlock(&ns_tree_lock); + + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); + + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); + + /* Remove from owner's tree if applicable */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); + } + } + + write_sequnlock(&ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static struct ns_tree_root *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) +{ + struct rb_node *node; + unsigned int seq; + + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree_root *ns_tree; + struct rb_node *node; + unsigned int seq; + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns(node); +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); +} + +/** + * __ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @ns_tree: namespace tree to search in + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. 
+ */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree_root *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) + return ERR_PTR(-ENOENT); + + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); +} + +/** + * __ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id + * + * Generates a new namespace id and assigns it to the namespace. All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) +{ + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); + return ns->ns_id; +} + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. 
+ */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + guard(ns_tree_locked_reader)(); + + node = owner->ns_owner_root.ns_rb.rb_node; + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static inline void ns_put(struct ns_common *ns) +{ + if (ns && ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. 
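The owner-scoped and unified lookups above both descend the rbtree for the smallest ns_id greater than or equal to the requested id, which is what lets listns() resume a listing right after the last id it handed out. A rough stand-alone sketch of that "ceiling" lookup, done on a sorted array of invented ids instead of an rbtree:

#include <stdint.h>
#include <stdio.h>

/* Return the index of the smallest id >= key, or -1 if there is none. */
static int ceiling_lookup(const uint64_t *ids, int n, uint64_t key)
{
	int lo = 0, hi = n - 1, best = -1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (key <= ids[mid]) {
			best = mid;		/* candidate; keep looking left */
			if (key == ids[mid])
				break;
			hi = mid - 1;
		} else {
			lo = mid + 1;
		}
	}
	return best;
}

int main(void)
{
	const uint64_t ids[] = { 11, 14, 19, 23, 42 };

	/* Resuming after id 14 means starting at the first id >= 15. */
	printf("%d\n", ceiling_lookup(ids, 5, 15));	/* 2 (id 19) */
	printf("%d\n", ceiling_lookup(ids, 5, 23));	/* 3 (exact hit) */
	printf("%d\n", ceiling_lookup(ids, 5, 50));	/* -1 (nothing left) */
	return 0;
}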
+ */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); + + ns = first_ns; + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { + struct ns_common *valid; + + if (!nr_ns_ids) + break; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree_root *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + guard(ns_tree_locked_reader)(); + + if (ns_tree) + node = ns_tree->ns_rb.rb_node; + else + node = ns_unified_root.ns_rb.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; + struct ns_tree_root *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list_head; + else + head = &ns_unified_root.ns_list_head; + + rcu_read_lock(); + + if (!first_ns) + 
first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} diff --git a/kernel/padata.c b/kernel/padata.c index b3d4eacc4f5d..aa66d91e20f9 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd) padata_put_pd_cnt(pd, 1); } -static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) -{ - int cpu, target_cpu; - - target_cpu = cpumask_first(pd->cpumask.pcpu); - for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); - - return target_cpu; -} - static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) { /* @@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) */ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); - return padata_index_to_cpu(pd, cpu_index); + return cpumask_nth(cpu_index, pd->cpumask.pcpu); } static struct padata_work *padata_work_alloc(void) @@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, err; struct parallel_data *pd; struct padata_work *pw; + int cpu_index, err; rcu_read_lock_bh(); @@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps, /* Select an alternate fallback CPU and notify the caller. */ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); - - cpu = cpumask_first(pd->cpumask.cbcpu); - for (i = 0; i < cpu_index; i++) - cpu = cpumask_next(cpu, pd->cpumask.cbcpu); - - *cb_cpu = cpu; + *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu); } err = -EBUSY; @@ -261,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel); * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. 
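The padata change above replaces the open-coded cpumask walks with cpumask_nth(), i.e. "pick the n-th set CPU of the mask" where n is the sequence number (or callback CPU hint) reduced modulo the mask weight. A small sketch of that operation on a plain bitmask, with an invented mask and sequence number:

#include <stdio.h>

/* Return the index of the n-th set bit (0-based), or -1 if out of range. */
static int nth_set_bit(unsigned int mask, unsigned int n)
{
	for (int bit = 0; mask; bit++, mask >>= 1) {
		if (!(mask & 1))
			continue;
		if (n-- == 0)
			return bit;
	}
	return -1;
}

int main(void)
{
	unsigned int pcpu_mask = 0xACu;	/* CPUs 2, 3, 5 and 7 set */
	unsigned int seq_nr = 6;
	/* The sequence number is reduced by the mask weight, then mapped to a CPU. */
	int cpu = nth_set_bit(pcpu_mask, seq_nr % 4);

	printf("seq %u -> cpu %d\n", seq_nr, cpu);	/* cpu 5 */
	return 0;
}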
*/ -static struct padata_priv *padata_find_next(struct parallel_data *pd, - bool remove_object) +static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu, + unsigned int processed) { struct padata_priv *padata; struct padata_list *reorder; - int cpu = pd->cpu; reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); - if (list_empty(&reorder->list)) { - spin_unlock(&reorder->lock); - return NULL; - } + if (list_empty(&reorder->list)) + goto notfound; padata = list_entry(reorder->list.next, struct padata_priv, list); @@ -282,96 +263,56 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, * Checks the rare case where two or more parallel jobs have hashed to * the same CPU and one of the later ones finishes first. */ - if (padata->seq_nr != pd->processed) { - spin_unlock(&reorder->lock); - return NULL; - } - - if (remove_object) { - list_del_init(&padata->list); - ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); - } + if (padata->seq_nr != processed) + goto notfound; + list_del_init(&padata->list); spin_unlock(&reorder->lock); return padata; + +notfound: + pd->processed = processed; + pd->cpu = cpu; + spin_unlock(&reorder->lock); + return NULL; } -static void padata_reorder(struct parallel_data *pd) +static void padata_reorder(struct padata_priv *padata) { + struct parallel_data *pd = padata->pd; struct padata_instance *pinst = pd->ps->pinst; - int cb_cpu; - struct padata_priv *padata; - struct padata_serial_queue *squeue; - struct padata_list *reorder; + unsigned int processed; + int cpu; - /* - * We need to ensure that only one cpu can work on dequeueing of - * the reorder queue the time. Calculating in which percpu reorder - * queue the next object will arrive takes some time. A spinlock - * would be highly contended. Also it is not clear in which order - * the objects arrive to the reorder queues. So a cpu could wait to - * get the lock just to notice that there is nothing to do at the - * moment. Therefore we use a trylock and let the holder of the lock - * care for all the objects enqueued during the holdtime of the lock. - */ - if (!spin_trylock_bh(&pd->lock)) - return; + processed = pd->processed; + cpu = pd->cpu; - while (1) { - padata = padata_find_next(pd, true); + do { + struct padata_serial_queue *squeue; + int cb_cpu; - /* - * If the next object that needs serialization is parallel - * processed by another cpu and is still on it's way to the - * cpu's reorder queue, nothing to do for now. - */ - if (!padata) - break; + processed++; + /* When sequence wraps around, reset to the first CPU. */ + if (unlikely(processed == 0)) + cpu = cpumask_first(pd->cpumask.pcpu); + else + cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); - spin_unlock(&squeue->serial.lock); - queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work); - } - - spin_unlock_bh(&pd->lock); - - /* - * The next object that needs serialization might have arrived to - * the reorder queues in the meantime. - * - * Ensure reorder queue is read after pd->lock is dropped so we see - * new objects from another task in padata_do_serial. Pairs with - * smp_mb in padata_do_serial. - */ - smp_mb(); - reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); - if (!list_empty(&reorder->list) && padata_find_next(pd, false)) { /* - * Other context(eg. the padata_serial_worker) can finish the request. 
- * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. + * If the next object that needs serialization is parallel + * processed by another cpu and is still on it's way to the + * cpu's reorder queue, end the loop. */ - padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); - } -} - -static void invoke_padata_reorder(struct work_struct *work) -{ - struct parallel_data *pd; - - local_bh_disable(); - pd = container_of(work, struct parallel_data, reorder_work); - padata_reorder(pd); - local_bh_enable(); - /* Pairs with putting the reorder_work in the serial_wq */ - padata_put_pd(pd); + padata = padata_find_next(pd, cpu, processed); + spin_unlock(&squeue->serial.lock); + } while (padata); } static void padata_serial_worker(struct work_struct *serial_work) @@ -422,6 +363,7 @@ void padata_do_serial(struct padata_priv *padata) struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; struct list_head *pos; + bool gotit = true; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ @@ -431,17 +373,14 @@ void padata_do_serial(struct padata_priv *padata) if ((signed int)(cur->seq_nr - padata->seq_nr) < 0) break; } - list_add(&padata->list, pos); + if (padata->seq_nr != pd->processed) { + gotit = false; + list_add(&padata->list, pos); + } spin_unlock(&reorder->lock); - /* - * Ensure the addition to the reorder list is ordered correctly - * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb - * in padata_reorder. - */ - smp_mb(); - - padata_reorder(pd); + if (gotit) + padata_reorder(padata); } EXPORT_SYMBOL(padata_do_serial); @@ -551,9 +490,9 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) do { nid = next_node_in(old_node, node_states[N_CPU]); } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); - queue_work_node(nid, system_unbound_wq, &pw->pw_work); + queue_work_node(nid, system_dfl_wq, &pw->pw_work); } else { - queue_work(system_unbound_wq, &pw->pw_work); + queue_work(system_dfl_wq, &pw->pw_work); } /* Use the current thread, which saves starting a workqueue worker. 
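padata_do_serial() above keeps the reorder list sorted with a wrap-safe comparison, (signed int)(cur->seq_nr - padata->seq_nr) < 0, and padata_reorder() resets to the first CPU when the sequence counter wraps to zero. A tiny sketch of why the signed cast of the unsigned difference handles the wraparound, using invented sequence numbers:

#include <stdbool.h>
#include <stdio.h>

/* true if sequence number a was issued before b, modulo 2^32 */
static bool seq_before(unsigned int a, unsigned int b)
{
	return (int)(a - b) < 0;
}

int main(void)
{
	printf("%d\n", seq_before(5, 10));            /* 1: plain ordering */
	printf("%d\n", seq_before(0xfffffffeu, 1));   /* 1: older despite the wrap */
	printf("%d\n", seq_before(10, 5));            /* 0 */
	return 0;
}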
*/ @@ -567,12 +506,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) padata_works_free(&works); } -static void __padata_list_init(struct padata_list *pd_list) -{ - INIT_LIST_HEAD(&pd_list->list); - spin_lock_init(&pd_list->lock); -} - /* Initialize all percpu queues used by serial workers */ static void padata_init_squeues(struct parallel_data *pd) { @@ -582,7 +515,8 @@ static void padata_init_squeues(struct parallel_data *pd) for_each_cpu(cpu, pd->cpumask.cbcpu) { squeue = per_cpu_ptr(pd->squeue, cpu); squeue->pd = pd; - __padata_list_init(&squeue->serial); + INIT_LIST_HEAD(&squeue->serial.list); + spin_lock_init(&squeue->serial.lock); INIT_WORK(&squeue->work, padata_serial_worker); } } @@ -595,7 +529,8 @@ static void padata_init_reorder_list(struct parallel_data *pd) for_each_cpu(cpu, pd->cpumask.pcpu) { list = per_cpu_ptr(pd->reorder_list, cpu); - __padata_list_init(list); + INIT_LIST_HEAD(&list->list); + spin_lock_init(&list->lock); } } @@ -631,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_squeues(pd); pd->seq_nr = -1; refcount_set(&pd->refcnt, 1); - spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); - INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -1030,8 +963,9 @@ struct padata_instance *padata_alloc(const char *name) cpus_read_lock(); - pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | - WQ_CPU_INTENSIVE, 1, name); + pinst->serial_wq = alloc_workqueue("%s_serial", + WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE | WQ_PERCPU, + 1, name); if (!pinst->serial_wq) goto err_put_cpus; @@ -1143,12 +1077,6 @@ void padata_free_shell(struct padata_shell *ps) if (!ps) return; - /* - * Wait for all _do_serial calls to finish to avoid touching - * freed pd's and ps's. - */ - synchronize_rcu(); - mutex_lock(&ps->pinst->lock); list_del(&ps->list); pd = rcu_dereference_protected(ps->pd, 1); diff --git a/kernel/panic.c b/kernel/panic.c index a3889f38153d..0d52210a9e2b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include <linux/sysfs.h> #include <linux/context_tracking.h> #include <linux/seq_buf.h> +#include <linux/sys_info.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -52,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? 
(1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; @@ -63,27 +64,77 @@ int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; +static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +static void panic_print_deprecated(void) +{ + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); +} + #ifdef CONFIG_SYSCTL + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + + /* + * Poor man's atomic or. 
Not worth adding a primitive + * to everyone's atomic.h for this + */ + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) + add_taint(i, LOCKDEP_STILL_OK); + } + + return err; +} + +static int sysctl_panic_print_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + panic_print_deprecated(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -97,12 +148,65 @@ static const struct ctl_table kern_panic_table[] = { }, #endif { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = sysctl_panic_print_handler, + }, + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "warn_limit", .data = &warn_limit, .maxlen = sizeof(warn_limit), .mode = 0644, .proc_handler = proc_douintvec, }, +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) @@ -113,6 +217,15 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */ +static int __init setup_panic_sys_info(char *buf) +{ + /* There is no risk of race in kernel boot phase */ + panic_print = sys_info_parse_param(buf); + return 1; +} +__setup("panic_sys_info=", setup_panic_sys_info); + static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS @@ -187,53 +300,73 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); -/* - * A variant of panic() called from NMI context. We return if we've already - * panicked on this CPU. If another CPU already panicked, loop in - * nmi_panic_self_stop() which can provide architecture dependent code such - * as saving register state for crash dump. - */ -void nmi_panic(struct pt_regs *regs, const char *msg) +bool panic_try_start(void) { int old_cpu, this_cpu; + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. 
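panic_try_start() below reduces to a single compare-and-exchange on panic_cpu: the first CPU to swap in its id owns the panic, and every later caller only observes the owner. A compact user-space model of that claim/observe pattern (the CPU ids and the helper are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

#define PANIC_CPU_INVALID (-1)

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* Mirrors panic_try_start(): claim panic_cpu if nobody owns it yet. */
static void try_start(int self)
{
	int expected = PANIC_CPU_INVALID;

	if (atomic_compare_exchange_strong(&panic_cpu, &expected, self))
		printf("cpu %d handles the panic\n", self);
	else if (expected != self)
		printf("cpu %d stops, panic owned by cpu %d\n", self, expected);
}

int main(void)
{
	try_start(0);	/* first caller wins the race */
	try_start(1);	/* later caller only sees the owner */
	return 0;
}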
+ */ old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) - panic("%s", msg); - else if (old_cpu != this_cpu) - nmi_panic_self_stop(regs); + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); } -EXPORT_SYMBOL(nmi_panic); +EXPORT_SYMBOL(panic_try_start); -static void panic_print_sys_info(bool console_flush) +void panic_reset(void) { - if (console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); - return; - } - - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !panic_on_this_cpu()); +} +EXPORT_SYMBOL(panic_on_other_cpu); - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + if (panic_try_start()) + panic("%s", msg); + else if (panic_on_other_cpu()) + nmi_panic_self_stop(regs); } +EXPORT_SYMBOL(nmi_panic); void check_panic_on_warn(const char *origin) { @@ -248,6 +381,19 @@ void check_panic_on_warn(const char *origin) origin, limit); } +static void panic_trigger_all_cpu_backtrace(void) +{ + /* Temporary allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -255,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. 
*/ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -277,20 +419,17 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system - * @fmt: The text string to print + * vpanic - halt the system + * @fmt: The text string to print + * @args: Arguments for the format string * - * Display a message, then perform cleanups. - * - * This function never returns. + * Display a message, then perform cleanups. This function never returns. */ -void panic(const char *fmt, ...) +void vpanic(const char *fmt, va_list args) { static char buf[1024]; - va_list args; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -327,32 +466,29 @@ void panic(const char *fmt, ...) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all @@ -382,7 +518,7 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(false); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); @@ -411,7 +547,9 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(true); + if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || + panic_console_replay) + console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; @@ -477,41 +615,53 @@ void panic(const char *fmt, ...) mdelay(PANIC_TIMER_STEP); } } +EXPORT_SYMBOL(vpanic); + +/* Identical to vpanic(), except it takes variadic arguments instead of va_list */ +void panic(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + vpanic(fmt, args); + va_end(args); +} EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. 
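/*
 * Sketch, not taken from the patch: with the .module field dropped from the
 * initializer, one TAINT_FLAG() entry of the table below expands to roughly
 * the following (TAINT_WARN picked arbitrarily as the example):
 */
	[ TAINT_WARN ] = {
		.c_true  = 'W',
		.c_false = ' ',
		.desc    = "WARN",
	},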
+ * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, + * please also modify tools/debugging/kernel-chktaint and + * Documentation/admin-guide/tainted-kernels.rst, including its + * small shell script that prints the TAINT_FLAGS_COUNT bits of + * /proc/sys/kernel/tainted. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG @@ -725,13 +875,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint, disable_trace_on_warning(); - if (file) - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", - raw_smp_processor_id(), current->pid, file, line, - caller); - else - pr_warn("WARNING: CPU: %d PID: %d at %pS\n", - raw_smp_processor_id(), current->pid, caller); + if (file) { + pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", + file, line, caller, + raw_smp_processor_id(), current->comm, current->pid); + } else { + pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", + caller, + raw_smp_processor_id(), current->comm, current->pid); + } #pragma GCC diagnostic push #ifndef __clang__ @@ -849,10 +1001,28 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); +core_param(panic_console_replay, panic_console_replay, bool, 0644); + +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); static int __init oops_setup(char *s) { diff --git 
a/kernel/params.c b/kernel/params.c index b92d64161b75..b96cfd693c99 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; + const size_t len = strnlen(val, kps->maxlen); - if (strnlen(val, kps->maxlen) == kps->maxlen) { + if (len == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } - strcpy(kps->string, val); + memcpy(kps->string, val, len + 1); return 0; } EXPORT_SYMBOL(param_set_copystring); @@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void) dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ - strcpy(modname, "kernel"); + strscpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; diff --git a/kernel/pid.c b/kernel/pid.c index 4ac2ce46817f..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,16 +71,12 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = PROC_PID_INIT_INO, -#ifdef CONFIG_PID_NS - .ns.ops = &pidns_operations, -#endif .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, @@ -100,6 +96,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { + pidfs_free_pid(pid); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -115,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -281,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; @@ -359,11 +361,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; - if (type == PIDTYPE_PID) { - WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); - wake_up_all(&pid->wait_pidfd); - } - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; @@ -495,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) struct upid *upid; pid_t nr = 0; - if (pid && ns->level <= pid->level) { + if (pid && ns && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; @@ -518,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (ns) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; @@ -684,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head, container_of(head->set, struct pid_namespace, set); int mode = table->mode; - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) || uid_eq(current_euid(), 
make_kuid(pidns->user_ns, 0))) mode = (mode & S_IRWXU) >> 6; else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) @@ -717,6 +715,29 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; +static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp_pid; + int r; + struct ctl_table tmp_table = *table; + + tmp_pid = pid_vnr(cad_pid); + tmp_table.data = &tmp_pid; + + r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp_pid); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + static const struct ctl_table pid_table[] = { { .procname = "pid_max", @@ -727,6 +748,14 @@ static const struct ctl_table pid_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif }; #endif diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include <linux/nstree.h> #include <uapi/linux/wait.h> #include "pid_sysctl.h" @@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; - refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + ns_tree_add(ns); return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); @@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) @@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct 
*task) { struct pid_namespace *ns; @@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); @@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, @@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 874ad834dc8d..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -obj-$(CONFIG_ENERGY_MODEL) += energy_model.o +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -43,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. 
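/*
 * Illustrative sketch, not taken from the patch: with pm_vt_switch_required()
 * now returning an int (see the hunk just below), a driver that depends on
 * the VT switch can propagate the allocation failure. "pdev", "ret" and the
 * calling driver are hypothetical.
 */
	ret = pm_vt_switch_required(&pdev->dev, true);
	if (ret)
		return ret;	/* -ENOMEM if the tracking entry could not be allocated */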
*/ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -57,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -66,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); @@ -136,17 +141,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto 
out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + 
genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..ceb3b2bb6ebe --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct 
genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..140ab548103c --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9b7e2b38c7a..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -17,12 +17,24 @@ #include <linux/sched/topology.h> #include <linux/slab.h> +#include "em_netlink.h" + /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. 
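/*
 * Illustrative sketch, not taken from the patch: walking the new global list
 * through for_each_em_perf_domain() (declared in em_netlink.h and implemented
 * later in this file), in the same way the netlink code above sizes its
 * reply. count_pd() and count_perf_domains() are hypothetical helpers.
 */
static int count_pd(struct em_perf_domain *pd, void *data)
{
	(*(int *)data)++;
	return 0;		/* returning non-zero stops the walk early */
}

static int count_perf_domains(void)
{
	int nr_pds = 0;

	for_each_em_perf_domain(count_pd, &nr_pds);
	return nr_pds;
}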
+ */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -116,6 +128,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_flags); +static int em_debug_id_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%d\n", pd->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_id); + static void em_debug_create_pd(struct device *dev) { struct em_dbg_info *em_dbg; @@ -132,6 +154,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("flags", 0444, d, dev->em_pd, &em_debug_flags_fops); + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, sizeof(*em_dbg), GFP_KERNEL); if (!em_dbg) @@ -233,6 +257,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table, unsigned long prev_cost = ULONG_MAX; int i, ret; + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + /* Compute the cost of each performance state. */ for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res, cost; @@ -324,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev, em_table_free(old_table); mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); return 0; } EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); @@ -392,7 +422,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -416,6 +446,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -440,6 +477,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -549,6 +587,30 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) { + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. 
+ */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; @@ -631,13 +693,18 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + if (ret) + return ret; - if (_is_cpu_device(dev)) - em_check_capacity_update(); + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); - return ret; + em_notify_pd_created(dev->em_pd); + + return 0; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device @@ -653,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -664,6 +737,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); @@ -698,10 +773,12 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, { int ret; - ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, - pd->flags); - if (ret) - goto free_em_table; + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } ret = em_dev_update_perf_domain(dev, em_table); if (ret) @@ -721,10 +798,24 @@ free_em_table: * Adjustment of CPU performance values after boot, when all CPUs capacites * are correctly calculated. */ -static void em_adjust_new_capacity(struct device *dev, +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, struct em_perf_domain *pd) { + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); em_table = em_table_dup(pd); if (!em_table) { @@ -737,13 +828,28 @@ static void em_adjust_new_capacity(struct device *dev, em_recalc_and_update(dev, pd, em_table); } +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. 
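/*
 * Illustrative sketch, not taken from the patch: the intended split between
 * the two helpers documented here. A driver registers its perf domain without
 * forcing an immediate capacity re-check, and the arch/topology side calls
 * em_adjust_cpu_capacity() once the CPU capacity has settled. ret, cpu_dev,
 * nr_states, my_em_cb, policy and cpu belong to a hypothetical driver context.
 */
	ret = em_dev_register_pd_no_update(cpu_dev, nr_states, &my_em_cb,
					   policy->cpus, true);
	if (ret)
		return ret;

	/* ... later, after arch_scale_cpu_capacity() has been updated ... */
	em_adjust_cpu_capacity(cpu);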
+ */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + static void em_check_capacity_update(void) { cpumask_var_t cpu_done_mask; - struct em_perf_state *table; - struct em_perf_domain *pd; - unsigned long cpu_capacity; - int cpu; + int cpu, failed_cpus = 0; if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { pr_warn("no free memory\n"); @@ -753,7 +859,7 @@ static void em_check_capacity_update(void) /* Check if CPUs capacity has changed than update EM */ for_each_possible_cpu(cpu) { struct cpufreq_policy *policy; - unsigned long em_max_perf; + struct em_perf_domain *pd; struct device *dev; if (cpumask_test_cpu(cpu, cpu_done_mask)) @@ -761,10 +867,8 @@ static void em_check_capacity_update(void) policy = cpufreq_cpu_get(cpu); if (!policy) { - pr_debug("Accessing cpu%d policy failed\n", cpu); - schedule_delayed_work(&em_update_work, - msecs_to_jiffies(1000)); - break; + failed_cpus++; + continue; } cpufreq_cpu_put(policy); @@ -776,26 +880,12 @@ static void em_check_capacity_update(void) cpumask_or(cpu_done_mask, cpu_done_mask, em_span_cpus(pd)); - cpu_capacity = arch_scale_cpu_capacity(cpu); - - rcu_read_lock(); - table = em_perf_state_from_pd(pd); - em_max_perf = table[pd->nr_perf_states - 1].performance; - rcu_read_unlock(); - - /* - * Check if the CPU capacity has been adjusted during boot - * and trigger the update for new performance values. - */ - if (em_max_perf == cpu_capacity) - continue; - - pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", - cpu, cpu_capacity, em_max_perf); - - em_adjust_new_capacity(dev, pd); + em_adjust_new_capacity(cpu, dev, pd); } + if (failed_cpus) + schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000)); + free_cpumask_var(cpu_done_mask); } @@ -918,3 +1008,39 @@ void em_rebuild_sched_domains(void) */ schedule_work(&rebuild_sd_work); } + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 23c0f4e6cb2f..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -80,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); @@ -90,6 +101,11 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && @@ -133,10 +149,15 @@ bool 
system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - pr_info("debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) @@ -371,6 +392,23 @@ static int create_image(int platform_mode) return error; } +static void shrink_shmem_memory(void) +{ + struct sysinfo info; + unsigned long nr_shmem_pages, nr_freed_pages; + + si_meminfo(&info); + nr_shmem_pages = info.sharedram; /* current page count used for shmem */ + /* + * The intent is to reclaim all shmem pages. Though shrink_all_memory() can + * only reclaim about half of them, it's enough for creating the hibernation + * image. + */ + nr_freed_pages = shrink_all_memory(nr_shmem_pages); + pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", + nr_shmem_pages, nr_freed_pages); +} + /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. @@ -412,6 +450,15 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } + /* + * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem + * pages will use lots of system memory, causing hibernation image creation + * fail due to insufficient free memory. + * This call is to force flush the shmem pages to swap disk and reclaim + * the system memory so that image creation can succeed. + */ + shrink_shmem_memory(); + console_suspend_all(); pm_restrict_gfp_mask(); @@ -549,7 +596,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); @@ -561,7 +607,6 @@ int hibernation_restore(int platform_mode) BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); - pm_restore_gfp_mask(); console_resume_all(); pm_restore_console(); return error; @@ -662,18 +707,11 @@ static void power_down(void) #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); - if (error) { - hibernation_mode = hibernation_ops ? - HIBERNATION_PLATFORM : - HIBERNATION_SHUTDOWN; - } else { - /* Restore swap signature. */ - error = swsusp_unmark(); - if (error) - pr_err("Swap will be unusable! Try swapon -a.\n"); + if (!error) + goto exit; - return; - } + hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; } #endif @@ -684,10 +722,9 @@ static void power_down(void) case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { - swsusp_unmark(); events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); - return; + goto exit; } fallthrough; case HIBERNATION_SHUTDOWN: @@ -706,6 +743,12 @@ static void power_down(void) pr_crit("Power down manually\n"); while (1) cpu_relax(); + +exit: + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); } static int load_image_and_restore(void) @@ -757,7 +800,7 @@ int hibernate(void) * Query for the compression algorithm support if compression is enabled. 
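/*
 * Note on the strscpy() conversions in the hunks below (not taken from the
 * patch): the two-argument form infers the destination size with sizeof() at
 * compile time, so it is only valid when the destination is a fixed-size
 * array; hib_comp_algo is assumed to be one, e.g.:
 *
 *	char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
 *
 *	strscpy(hib_comp_algo, hibernate_compressor);
 *	strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
 *
 * The two calls above are equivalent.
 */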
*/ if (!nocompress) { - strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, hibernate_compressor); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; @@ -777,7 +820,11 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -846,6 +893,8 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: + filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); @@ -882,6 +931,8 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; + filesystems_freeze(filesystem_freeze_enabled); + error = freeze_processes(); if (error) goto exit; @@ -941,6 +992,7 @@ thaw: thaw_processes(); exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: @@ -1006,9 +1058,9 @@ static int software_resume(void) */ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); else - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; @@ -1029,19 +1081,25 @@ static int software_resume(void) if (error) goto Restore; + filesystems_freeze(filesystem_freeze_enabled); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); - if (error) + if (error) { + filesystems_thaw(); goto Close_Finish; + } error = freeze_kernel_threads(); if (error) { thaw_processes(); + filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); + filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: @@ -1456,8 +1514,7 @@ static int hibernate_compressor_param_set(const char *compressor, if (index >= 0) { ret = param_set_copystring(comp_alg_enabled[index], kp); if (!ret) - strscpy(hib_comp_algo, comp_alg_enabled[index], - sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, comp_alg_enabled[index]); } else { ret = index; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 6254814d4817..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ #include <linux/acpi.h> #include <linux/export.h> +#include <linux/init.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> @@ -17,6 +18,8 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" @@ -30,23 +33,35 @@ * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). 
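/*
 * Illustrative sketch, not taken from the patch: with the reference count
 * added just below, restrict/restore calls may nest and only the outermost
 * pair changes gfp_allowed_mask.
 */
	pm_restrict_gfp_mask();		/* count 0 -> 1: mask saved and restricted */
	pm_restrict_gfp_mask();		/* count 1 -> 2: no further change */
	pm_restore_gfp_mask();		/* count 2 -> 1: mask stays restricted */
	pm_restore_gfp_mask();		/* count 1 -> 0: original mask restored */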
*/ +static unsigned int saved_gfp_count; static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } + + if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + return; + + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + + pm_pr_dbg("GFP mask restored\n"); } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); + + if (saved_gfp_count++) { + WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask); + return; + } + saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); + + pm_pr_dbg("GFP mask restricted\n"); } unsigned int lock_system_sleep(void) @@ -79,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. + */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -112,6 +182,14 @@ int pm_notifier_call_chain(unsigned long val) /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -210,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. 
*/ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); @@ -557,6 +635,10 @@ static int __init pm_debugfs_init(void) late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -594,7 +676,7 @@ power_attr(pm_print_times); static inline void pm_print_times_init(void) { - pm_print_times_enabled = !!initcall_debug; + pm_print_times_enabled = initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, @@ -613,7 +695,7 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; + return pm_debug_messages_on && pm_sleep_transition_in_progress(); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); @@ -962,6 +1044,34 @@ power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +bool filesystem_freeze_enabled = false; + +static ssize_t freeze_filesystems_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); +} + +static ssize_t freeze_filesystems_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + filesystem_freeze_enabled = !!val; + return n; +} + +power_attr(freeze_filesystems); +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -992,6 +1102,9 @@ static struct attribute * g[] = { #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + &freeze_filesystems_attr.attr, +#endif NULL, }; @@ -1010,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 
0 : -ENOMEM; +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif + + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index c352dea2f67b..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -18,6 +18,11 @@ struct swsusp_info { unsigned long size; } __aligned(PAGE_SIZE); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); +extern bool filesystem_freeze_enabled; +#endif + #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __init hibernate_reserved_size_init(void); @@ -71,10 +76,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ @@ -231,11 +240,6 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); -void pm_restrict_gfp_mask(void); -void pm_restore_gfp_mask(void); -#else -static inline void pm_restrict_gfp_mask(void) {} -static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/process.c b/kernel/power/process.c index 66ac067d9ae6..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -189,7 +189,7 @@ void thaw_processes(void) oom_killer_enable(); - pr_info("Restarting tasks ... "); + pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -208,7 +208,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - pr_cont("done.\n"); + pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -217,7 +217,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - pr_info("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); @@ -229,5 +229,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - pr_cont("done.\n"); + pr_info("Done restarting kernel threads.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. 
+ * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4e6e24e8b854..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -363,7 +363,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * * One radix tree is represented by one struct mem_zone_bm_rtree. There are * two linked lists for the nodes of the tree, one for the inner nodes and - * one for the leave nodes. The linked leave nodes are used for fast linear + * one for the leaf nodes. The linked leaf nodes are used for fast linear * access of the memory bitmap. * * The struct rtree_node represents one node of the radix tree. 
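/*
 * Illustrative user-space sketch, not taken from the patch, for the
 * /dev/cpu_wakeup_latency interface added in kernel/power/qos.c above. The
 * device name comes from the miscdevice definition; the value unit is assumed
 * to be the usual PM QoS latency unit (microseconds). The request is dropped
 * when the file descriptor is closed.
 */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t limit = 100;	/* requested CPU system wakeup latency limit */
	int fd = open("/dev/cpu_wakeup_latency", O_RDWR);

	if (fd < 0)
		return 1;
	if (write(fd, &limit, sizeof(limit)) != sizeof(limit)) {	/* binary s32, as the fops accept */
		close(fd);
		return 1;
	}
	pause();	/* constraint stays active while the fd remains open */
	close(fd);
	return 0;
}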
@@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } - if (page_zone(page) != zone) - continue; + if (page_zone(page) != zone) + continue; - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } for_each_migratetype_order(order, t) { list_for_each_entry(page, @@ -1538,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); copy_pfn = memory_bm_next_pfn(copy_bm); - for(;;) { + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; @@ -2112,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - pr_err("Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - pr_err("Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } /* * During allocating of suspend pagedir, new cold pages may appear. 
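The mark_nosave_pages() and mark_free_pages() hunks above replace open-coded loops of the form "for (pfn = start; pfn < end; pfn++) if (pfn_valid(pfn))" with for_each_valid_pfn(), which only visits PFNs that pass pfn_valid(). A minimal sketch of the idiom, with hypothetical start/end bounds standing in for the caller's range:

/*
 * Sketch of the iteration idiom used in the hunks above (illustrative
 * only; start_pfn/end_pfn stand in for whatever range the caller has).
 * for_each_valid_pfn() visits only PFNs for which pfn_valid() holds, so
 * the explicit per-iteration pfn_valid() test can be dropped.
 */
static void walk_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for_each_valid_pfn(pfn, start_pfn, end_pfn) {
		struct page *page = pfn_to_page(pfn);

		/* operate on @page; invalid PFNs are never visited */
		(void)page;
	}
}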
@@ -2146,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void) nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } @@ -2163,13 +2160,13 @@ static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -2363,7 +2360,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, struct memory_bitmap *zero_bm) { unsigned long decoded_pfn; - bool zero; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8eaec4ab121d..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/fs.h> #include "power.h" @@ -343,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -374,6 +379,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); @@ -381,6 +387,7 @@ static int suspend_prepare(suspend_state_t state) return 0; dpm_save_failed_step(SUSPEND_FREEZE); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: pm_restore_console(); @@ -550,6 +557,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } @@ -585,7 +593,11 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } @@ -600,9 +612,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("suspend_enter"), state, false); pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); - pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: events_check_enabled = false; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 80ff5f933a62..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -46,19 +46,18 @@ static bool 
clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,10 +88,8 @@ struct swap_map_page_list { }; /* - * The swap_map_handle structure is used for handling swap in - * a file-alike way + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,10 +114,9 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; /* - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/* - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/* - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. 
+ */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; @@ -268,35 +261,26 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, +static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr) +{ + return bdev_rw_virt(file_bdev(hib_resume_bdev_file), + page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf); +} + +static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { - struct page *page = virt_to_page(addr); struct bio *bio; - int error = 0; bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - pr_err("Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - return -EFAULT; - } - - if (hb) { - bio->bi_end_io = hib_end_io; - bio->bi_private = hb; - atomic_inc(&hb->count); - submit_bio(bio); - } else { - error = submit_bio_wait(bio); - bio_put(bio); - } - - return error; + bio_add_virt_nofail(bio, addr, PAGE_SIZE); + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + return 0; } static int hib_wait_io(struct hib_bio_batch *hb) @@ -312,11 +296,12 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -329,8 +314,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, - swsusp_resume_block, swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header); } else { pr_err("Swap header not found!\n"); error = -ENODEV; @@ -345,16 +330,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) */ unsigned int swsusp_header_flags; -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -371,45 +354,32 @@ static int swsusp_swap_check(void) return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. 
- * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; void *src; int ret; if (!offset) return -ENOSPC; - if (hb) { - src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_io(hb); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(GFP_NOIO | - __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - hb = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; + if (!hb) + goto sync_io; + + src = (void *)__get_free_page(gfp); + if (!src) { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(gfp); + if (WARN_ON_ONCE(!src)) + goto sync_io; } - return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); + + copy_page(src, buf); + return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +sync_io: + return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf); } static void release_swap_writer(struct swap_map_handle *handle) @@ -534,17 +504,14 @@ static int swap_writer_finish(struct swap_map_handle *handle, CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define CMP_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 -/** - * save_image - save the suspend image data - */ - static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -600,13 +567,48 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ - unsigned char *unc[CMP_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/* - * CRC32 update function that runs in its own thread. - */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; @@ -631,6 +633,7 @@ static int crc32_threadfn(void *data) } return 0; } + /* * Structure used for data compression. */ @@ -650,11 +653,8 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); -/* - * Compression function that runs in its own thread. 
- */ static int compress_threadfn(void *data) { struct cmp_data *d = data; @@ -679,19 +679,13 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * save_compressed_image - Save the suspend image data after compression. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -704,21 +698,21 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { @@ -727,14 +721,14 @@ static int save_compressed_image(struct swap_map_handle *handle, goto out_clean; } - data = vzalloc(array_size(nr_threads, sizeof(*data))); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -892,19 +886,18 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -914,18 +907,12 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -938,15 +925,16 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * - * It is important _NOT_ to umount filesystems at this point. 
We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -989,8 +977,8 @@ out_finish: } /* - * The following functions allow us to read data using a swap map - * in a file-like way. + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1041,7 +1029,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); + error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map); if (error) { release_swap_reader(handle); return error; @@ -1065,7 +1053,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, offset, buf, hb); + if (hb) + error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb); + else + error = hib_submit_io_sync(REQ_OP_READ, offset, buf); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1089,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1165,9 +1150,6 @@ struct dec_data { unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/* - * Decompression function that runs in its own thread. - */ static int decompress_threadfn(void *data) { struct dec_data *d = data; @@ -1202,12 +1184,6 @@ static int decompress_threadfn(void *data) return 0; } -/** - * load_compressed_image - Load compressed image data and decompress it. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1235,23 +1211,23 @@ static int load_compressed_image(struct swap_map_handle *handle, * footprint. 
*/ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); - page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page))); + page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - data = vzalloc(array_size(nr_threads, sizeof(*data))); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1518,11 +1494,7 @@ out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -1541,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1579,8 +1552,9 @@ static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; @@ -1590,8 +1564,8 @@ int swsusp_check(bool exclusive) BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, + swsusp_header); if (error) goto put; @@ -1599,9 +1573,9 @@ int swsusp_check(bool exclusive) memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { error = -EINVAL; } @@ -1630,7 +1604,6 @@ put: /** * swsusp_close - close resume device. */ - void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { @@ -1642,21 +1615,21 @@ void swsusp_close(void) } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. 
*/ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; @@ -1671,8 +1644,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1680,3 +1691,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f9e3efb9f6e..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 52571dcad768..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) len += sysfs_emit_at(buf, len, "%s ", wl->name); } + if (len > 0) + --len; + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); diff --git a/kernel/printk/.kunitconfig b/kernel/printk/.kunitconfig new file mode 100644 index 000000000000..f31458fd1a92 --- /dev/null +++ b/kernel/printk/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_PRINTK=y +CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST=y diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 39a2b61c7232..f8004ac3983d 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_PRINTK_INDEX) += index.o obj-$(CONFIG_PRINTK) += printk_support.o printk_support-y := printk_ringbuffer.o printk_support-$(CONFIG_SYSCTL) += sysctl.o + +obj-$(CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST) += printk_ringbuffer_kunit_test.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 
48a24e7b309d..5f5f626f4279 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -3,7 +3,6 @@ * internal.h - printk internal definitions */ #include <linux/console.h> -#include <linux/percpu.h> #include <linux/types.h> #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) @@ -64,6 +63,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) @@ -72,7 +72,6 @@ int vprintk_store(int facility, int level, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); -__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); @@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con); void nbcon_kthread_stop(struct console *con); void nbcon_kthreads_wake(void); -/* - * Check if the given console is currently capable and allowed to print - * records. Note that this function does not consider the current context, - * which can also play a role in deciding if @con can be used to print - * records. - */ -static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) -{ - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; - - /* - * For the !use_atomic case, @printk_kthreads_running is not - * checked because the write_thread() callback is also used - * via the legacy loop when the printer threads are not - * available. - */ - } else { - if (!con->write) - return false; - } - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - /** * nbcon_kthread_wake - Wake up a console printing thread * @con: Console to operate on @@ -180,6 +138,7 @@ static inline void nbcon_kthread_wake(struct console *con) #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) +#define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem @@ -203,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand static inline void nbcon_kthread_wake(struct console *con) { } static inline void nbcon_kthreads_wake(void) { } -static inline bool console_is_usable(struct console *con, short flags, - bool use_atomic) { return false; } - #endif /* CONFIG_PRINTK */ extern bool have_boot_console; @@ -229,6 +185,8 @@ struct console_flush_type { bool legacy_offload; }; +extern bool console_irqwork_blocked; + /* * Identify which console flushing methods should be used in the context of * the caller. 
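The printk_get_console_flush_type() logic that follows now also consults console_irqwork_blocked before selecting any offloaded flushing. Its callers are unchanged; the calling convention, assembled here from call sites later in this diff rather than defined anew, looks roughly like this:

/*
 * Sketch of the calling convention, mirroring call sites elsewhere in
 * this diff (e.g. nbcon_cpu_emergency_exit() and console_resume_all()).
 */
struct console_flush_type ft;

printk_get_console_flush_type(&ft);
if (ft.nbcon_offload)
	nbcon_kthreads_wake();		/* wake the nbcon printer kthreads */
if (ft.legacy_offload)
	defer_console_output();	/* queue irq_work for legacy consoles */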
@@ -240,7 +198,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) switch (nbcon_get_default_prio()) { case NBCON_PRIO_NORMAL: if (have_nbcon_console && !have_boot_console) { - if (printk_kthreads_running) + if (printk_kthreads_running && !console_irqwork_blocked) ft->nbcon_offload = true; else ft->nbcon_atomic = true; @@ -250,7 +208,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; @@ -263,7 +221,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; @@ -331,7 +289,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..3fa403f9831f 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -10,8 +10,10 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> +#include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> +#include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> @@ -117,6 +119,9 @@ * from scratch. */ +/* Counter of active nbcon emergency contexts. */ +static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); + /** * nbcon_state_set - Helper function to set the console state * @con: Console to update @@ -214,8 +219,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) /** * nbcon_context_try_acquire_direct - Try to acquire directly - * @ctxt: The context of the caller - * @cur: The current console state + * @ctxt: The context of the caller + * @cur: The current console state + * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. @@ -225,17 +231,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) * * Errors: * - * -EPERM: A panic is in progress and this is not the panic CPU. - * Or the current owner or waiter has the same or higher - * priority. No acquire method can be successful in - * this case. + * -EPERM: A panic is in progress and this is neither the panic + * CPU nor is this a reacquire. Or the current owner or + * waiter has the same or higher priority. No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +249,23 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). 
In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * One exception is when kdb has locked for printing on this CPU. + * + * Second exception is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu() && + !kdb_printf_on_this_cpu() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +316,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the panic_on_other_cpu() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -340,7 +356,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -431,6 +447,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (panic_on_other_cpu()) + return -EPERM; + /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +584,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,16 +595,15 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; nbcon_state_read(con, &cur); try_again: - err = nbcon_context_try_acquire_direct(ctxt, &cur); + err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; @@ -595,7 +621,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. 
*/ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; @@ -831,8 +857,8 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } -static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, - char *buf, unsigned int len) +void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -913,7 +939,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); - while (!nbcon_context_try_acquire(ctxt)) + while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); @@ -1101,7 +1127,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) cant_migrate(); } - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* @@ -1144,6 +1170,17 @@ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_contex if (kthread_should_stop()) return true; + /* + * Block the kthread when the system is in an emergency or panic mode. + * It increases the chance that these contexts would be able to show + * the messages directly. And it reduces the risk of interrupted writes + * where the context with a higher priority takes over the nbcon console + * ownership in the middle of a message. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + return false; + cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); @@ -1195,6 +1232,14 @@ wait_for_event: if (kthread_should_stop()) return 0; + /* + * Block the kthread when the system is in an emergency or panic + * mode. See nbcon_kthread_should_wakeup() for more details. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + goto wait_for_event; + backlog = false; /* @@ -1257,6 +1302,13 @@ void nbcon_kthreads_wake(void) if (!printk_kthreads_running) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. + */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) @@ -1375,7 +1427,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); @@ -1385,6 +1437,26 @@ enum nbcon_prio nbcon_get_default_prio(void) return NBCON_PRIO_NORMAL; } +/* + * Track if it is allowed to perform unsafe hostile takeovers of console + * ownership. When true, console drivers might perform unsafe actions while + * printing. It is externally available via nbcon_allow_unsafe_takeover(). + */ +static bool panic_nbcon_allow_unsafe_takeover; + +/** + * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed + * + * Return: True, when it is permitted to perform unsafe console printing + * + * This is also used by console_is_usable() to determine if it is allowed to + * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). 
+ */ +bool nbcon_allow_unsafe_takeover(void) +{ + return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; +} + /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts @@ -1455,7 +1527,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. @@ -1474,8 +1545,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * returned, it cannot be expected that the unfinalized record will become * available. */ -static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); @@ -1484,12 +1554,12 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); - ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - - if (!nbcon_context_try_acquire(ctxt)) - return -EPERM; + ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); while (nbcon_seq_read(con) < stop_seq) { + if (!nbcon_context_try_acquire(ctxt, false)) + return -EPERM; + /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no @@ -1498,6 +1568,8 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; + nbcon_context_release(ctxt); + if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) @@ -1506,7 +1578,6 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, } } - nbcon_context_release(ctxt); return err; } @@ -1515,15 +1586,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. 
*/ -static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; unsigned long flags; @@ -1538,7 +1607,7 @@ again: */ local_irq_save(flags); - err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + err = __nbcon_atomic_flush_pending_con(con, stop_seq); local_irq_restore(flags); @@ -1570,9 +1639,8 @@ again: * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ -static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; @@ -1590,7 +1658,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove if (nbcon_seq_read(con) >= stop_seq) continue; - nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } @@ -1606,7 +1674,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove */ void nbcon_atomic_flush_pending(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** @@ -1618,7 +1686,9 @@ void nbcon_atomic_flush_pending(void) */ void nbcon_atomic_flush_unsafe(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); + panic_nbcon_allow_unsafe_takeover = true; + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); + panic_nbcon_allow_unsafe_takeover = false; } /** @@ -1636,6 +1706,8 @@ void nbcon_cpu_emergency_enter(void) preempt_disable(); + atomic_inc(&nbcon_cpu_emergency_cnt); + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } @@ -1650,10 +1722,24 @@ void nbcon_cpu_emergency_exit(void) unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); - if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; + /* + * Wake up kthreads because there might be some pending messages + * added by other CPUs with normal priority since the last flush + * in the emergency context. + */ + if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) { + if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) { + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + } + } + preempt_enable(); } @@ -1671,6 +1757,9 @@ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; + /* Synchronize the kthread start. */ + lockdep_assert_console_list_lock_held(); + /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; @@ -1701,12 +1790,15 @@ bool nbcon_alloc(struct console *con) return false; } - if (printk_kthreads_running) { + if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } + + /* Might be the first kthread. */ + printk_kthreads_running = true; } } @@ -1716,14 +1808,30 @@ bool nbcon_alloc(struct console *con) /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data + * + * Important: @have_nbcon_console must be updated before calling + * this function. 
In particular, it can be set only when there + * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; - if (printk_kthreads_running) + /* Synchronize the kthread stop. */ + lockdep_assert_console_list_lock_held(); + + if (printk_kthreads_running) { nbcon_kthread_stop(con); + /* Might be the last nbcon console. + * + * Do not rely on printk_kthreads_check_locked(). It is not + * called in some code paths, see nbcon_free() callers. + */ + if (!have_nbcon_console) + printk_kthreads_running = false; + } + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ @@ -1762,7 +1870,7 @@ bool nbcon_device_try_acquire(struct console *con) ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) @@ -1803,14 +1911,75 @@ void nbcon_device_release(struct console *con) * using the legacy loop. */ if (ft.nbcon_atomic) { - __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb)); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { - printk_trigger_flush(); + defer_console_output(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release); + +/** + * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * @wctxt: The nbcon write context to be used on success + * + * Context: Under console_srcu_read_lock() for emitting a single kdb message + * using the given con->write_atomic() callback. Can be called + * only when the console is usable at the moment. + * + * Return: True if the console was acquired. False otherwise. + * + * kdb emits messages on consoles registered for printk() without + * storing them into the ring buffer. It has to acquire the console + * ownerhip so that it could call con->write_atomic() callback a safe way. + * + * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY + * and marks it unsafe for handover/takeover. + */ +bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_EMERGENCY; + + if (!nbcon_context_try_acquire(ctxt, false)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} + +/** + * nbcon_kdb_release - Exit unsafe section and release the nbcon console + * + * @wctxt: The nbcon write context initialized by a successful + * nbcon_kdb_try_acquire() + */ +void nbcon_kdb_release(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * Flush any new printk() messages added when the console was blocked. + * Only the console used by the given write context was blocked. + * The console was locked only when the write_atomic() callback + * was usable. 
+ */ + __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb)); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..1d765ad242b8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/panic.h> #include <linux/uaccess.h> #include <asm/sections.h> @@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - -/* - * Return true if a panic is in progress on a remote CPU. - * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !this_cpu_in_panic()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -489,6 +462,9 @@ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; +/* Avoid using irq_work when suspending. */ +bool console_irqwork_blocked; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); @@ -2407,7 +2383,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2417,7 +2393,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - ft.legacy_offload |= ft.legacy_direct; + ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked; ft.legacy_direct = false; } @@ -2453,7 +2429,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (ft.legacy_offload) defer_console_output(); - else + else if (!console_irqwork_blocked) wake_up_klogd(); return printed_len; @@ -2757,10 +2733,20 @@ void console_suspend_all(void) { struct console *con; + if (console_suspend_enabled) + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + + /* + * Flush any console backlog and then avoid queueing irq_work until + * console_resume_all(). Until then deferred printing is no longer + * triggered, NBCON consoles transition to atomic flushing, and + * any klogd waiters are not triggered. 
+ */ + pr_flush(1000, true); + console_irqwork_blocked = true; + if (!console_suspend_enabled) return; - pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); - pr_flush(1000, true); console_list_lock(); for_each_console(con) @@ -2781,26 +2767,34 @@ void console_resume_all(void) struct console_flush_type ft; struct console *con; - if (!console_suspend_enabled) - return; - - console_list_lock(); - for_each_console(con) - console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); - console_list_unlock(); - /* - * Ensure that all SRCU list walks have completed. All printing - * contexts must be able to see they are no longer suspended so - * that they are guaranteed to wake up and resume printing. + * Allow queueing irq_work. After restoring console state, deferred + * printing and any klogd waiters need to be triggered in case there + * is now a console backlog. */ - synchronize_srcu(&console_srcu); + console_irqwork_blocked = false; + + if (console_suspend_enabled) { + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + } printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); + else + wake_up_klogd(); pr_flush(1000, true); } @@ -2843,7 +2837,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2863,7 +2857,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3029,21 +3023,18 @@ out: } /* - * Legacy console printing from printk() caller context does not respect - * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a - * false positive. For PREEMPT_RT the false positive condition does not - * occur. - * - * This map is used to temporarily establish LD_WAIT_SLEEP context for the - * console write() callback when legacy printing to avoid false positive - * lockdep complaints, thus allowing lockdep to continue to function for - * real issues. + * The legacy console always acquires a spinlock_t from its printing + * callback. This violates lock nesting if the caller acquired an always + * spinning lock (raw_spinlock_t) while invoking printk(). This is not a + * problem on PREEMPT_RT because legacy consoles print always from a + * dedicated thread and never from within printk(). Therefore we tell + * lockdep that a sleeping spin lock (spinlock_t) is valid here. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else -static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG); static inline void printk_legacy_allow_spinlock_enter(void) { @@ -3161,104 +3152,147 @@ static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ + /* - * Print out all remaining records to all consoles. + * Print out one record for each console. 
* * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. - * The value is valid only when this function returns true. It means that all - * usable consoles are completely flushed. + * The value is valid only when all usable consoles were flushed. It is + * when the function returns true (can do the job) and @try_again parameter + * is set to false, see below. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * - * Returns true when there was at least one usable console and all messages - * were flushed to all usable consoles. A returned false informs the caller - * that everything was not flushed (either there were no usable consoles or - * another context has taken over printing or it is a panic situation and this - * is not the panic CPU). Regardless the reason, the caller should assume it - * is not useful to immediately try again. + * @try_again will be set to true when it still makes sense to call this + * function again. The function could do the job, see the return value. + * And some consoles still make progress. + * + * Returns true when the function could do the job. Some consoles are usable, + * and there was no takeover and no panic_on_other_cpu(). * * Requires the console_lock. */ -static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover, + bool *try_again) { struct console_flush_type ft; bool any_usable = false; struct console *con; - bool any_progress; int cookie; - *next_seq = 0; - *handover = false; + *try_again = false; - do { - any_progress = false; + printk_get_console_flush_type(&ft); - printk_get_console_flush_type(&ft); + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + bool progress; - cookie = console_srcu_read_lock(); - for_each_console_srcu(con) { - short flags = console_srcu_read_flags(con); - u64 printk_seq; - bool progress; + /* + * console_flush_one_record() is only responsible for + * nbcon consoles when the nbcon consoles cannot print via + * their atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; - /* - * console_flush_all() is only responsible for nbcon - * consoles when the nbcon consoles cannot print via - * their atomic or threaded flushing. - */ - if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) - continue; + if (!console_is_usable(con, flags, !do_cond_resched)) + continue; + any_usable = true; - if (!console_is_usable(con, flags, !do_cond_resched)) - continue; - any_usable = true; + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } - if (flags & CON_NBCON) { - progress = nbcon_legacy_emit_next_record(con, handover, cookie, - !do_cond_resched); - printk_seq = nbcon_seq_read(con); - } else { - progress = console_emit_next_record(con, handover, cookie); - printk_seq = con->seq; - } + /* + * If a handover has occurred, the SRCU read lock + * is already released. 
+ */ + if (*handover) + goto fail; - /* - * If a handover has occurred, the SRCU read lock - * is already released. - */ - if (*handover) - return false; + /* Track the next of the highest seq flushed. */ + if (printk_seq > *next_seq) + *next_seq = printk_seq; - /* Track the next of the highest seq flushed. */ - if (printk_seq > *next_seq) - *next_seq = printk_seq; + if (!progress) + continue; - if (!progress) - continue; - any_progress = true; + /* + * An usable console made a progress. There might still be + * pending messages. + */ + *try_again = true; - /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) - goto abandon; + /* Allow panic_cpu to take over the consoles safely. */ + if (panic_on_other_cpu()) + goto fail_srcu; - if (do_cond_resched) - cond_resched(); - } - console_srcu_read_unlock(cookie); - } while (any_progress); + if (do_cond_resched) + cond_resched(); + } + console_srcu_read_unlock(cookie); return any_usable; -abandon: +fail_srcu: console_srcu_read_unlock(cookie); +fail: + *try_again = false; return false; } +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + bool try_again; + bool ret; + + *next_seq = 0; + *handover = false; + + do { + ret = console_flush_one_record(do_cond_resched, next_seq, + handover, &try_again); + } while (try_again); + + return ret; +} + static void __console_flush_and_unlock(void) { bool do_cond_resched; @@ -3358,12 +3392,10 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) { + if (c->unblank) { found_unblank = true; break; } @@ -3400,12 +3432,10 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) + if (c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3574,7 +3604,7 @@ EXPORT_SYMBOL(console_resume); static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. 
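With the refactoring above, console_flush_all() is a thin wrapper that keeps calling console_flush_one_record() while the callee reports progress via @try_again. A minimal userspace sketch of that "at most one record per consumer per pass, loop while anything advanced" shape follows; the consumer structure and item list are hypothetical, not printk code.

/*
 * Each pass hands at most one pending item to every consumer and reports
 * whether any consumer made progress, so the caller decides to loop again.
 */
#include <stdbool.h>
#include <stdio.h>

struct consumer {
	const char *name;
	int next;		/* index of the next item this consumer emits */
};

static const char *items[] = { "one", "two", "three" };
#define NR_ITEMS 3

/* Emit at most one record per consumer; report whether to try again. */
static bool flush_one_record(struct consumer *cons, int nr, bool *try_again)
{
	bool any_usable = false;

	*try_again = false;
	for (int i = 0; i < nr; i++) {
		any_usable = true;
		if (cons[i].next >= NR_ITEMS)
			continue;	/* this consumer is caught up */
		printf("%s: %s\n", cons[i].name, items[cons[i].next++]);
		*try_again = true;	/* progress: more may be pending */
	}
	return any_usable;
}

int main(void)
{
	struct consumer cons[] = { { "conA", 0 }, { "conB", 2 } };
	bool try_again;

	do {
		flush_one_record(cons, 2, &try_again);
	} while (try_again);
	return 0;
}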
*/ -static bool printk_kthreads_ready __ro_after_init; +bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; @@ -3628,17 +3658,26 @@ static bool legacy_kthread_should_wakeup(void) static int legacy_kthread_func(void *unused) { - for (;;) { - wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + bool try_again; + +wait_for_event: + wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + + do { + bool handover = false; + u64 next_seq = 0; if (kthread_should_stop()) - break; + return 0; console_lock(); - __console_flush_and_unlock(); - } + console_flush_one_record(true, &next_seq, &handover, &try_again); + if (!handover) + __console_unlock(); - return 0; + } while (try_again); + + goto wait_for_event; } static bool legacy_kthread_create(void) @@ -3666,12 +3705,13 @@ static bool legacy_kthread_create(void) /** * printk_kthreads_shutdown - shutdown all threaded printers + * @data: syscore context * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ -static void printk_kthreads_shutdown(void) +static void printk_kthreads_shutdown(void *data) { struct console *con; @@ -3693,10 +3733,14 @@ static void printk_kthreads_shutdown(void) console_list_unlock(); } -static struct syscore_ops printk_syscore_ops = { +static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; +static struct syscore printk_syscore = { + .ops = &printk_syscore_ops, +}; + /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. @@ -3713,6 +3757,7 @@ static void printk_kthreads_check_locked(void) if (!printk_kthreads_ready) return; + /* Start or stop the legacy kthread when needed. */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && @@ -3763,7 +3808,7 @@ static void printk_kthreads_check_locked(void) static int __init printk_set_kthreads_ready(void) { - register_syscore_ops(&printk_syscore_ops); + register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; @@ -4204,14 +4249,6 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); - if (console->flags & CON_NBCON) - nbcon_free(console); - - console_sysfs_notify(); - - if (console->exit) - res = console->exit(console); - /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. @@ -4232,6 +4269,15 @@ static int unregister_console_locked(struct console *console) if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; + /* @have_nbcon_console must be updated before calling nbcon_free(). */ + if (console->flags & CON_NBCON) + nbcon_free(console); + + console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); @@ -4536,6 +4582,13 @@ static void __wake_up_klogd(int val) if (!printk_percpu_data_ready()) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. 
+ */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait @@ -4592,9 +4645,30 @@ void defer_console_output(void) __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } +/** + * printk_trigger_flush - Attempt to flush printk buffer to consoles. + * + * If possible, flush the printk buffer to all consoles in the caller's + * context. If offloading is available, trigger deferred printing. + * + * This is best effort. Depending on the system state, console states, + * and caller context, no actual flushing may result from this call. + */ void printk_trigger_flush(void) { - defer_console_output(); + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } + if (ft.legacy_offload) + defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..56c8e3d031f4 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <kunit/visibility.h> #include <linux/kernel.h> #include <linux/irqflags.h> #include <linux/string.h> @@ -393,25 +394,38 @@ static unsigned int to_blk_size(unsigned int size) * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the - * assumption is safe. + * assumption is safe. In particular, it guarantees that data_push_tail() will + * never attempt to push the tail beyond the head. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { - struct prb_data_block *db = NULL; - + /* Data-less blocks take no space. */ if (size == 0) return true; /* - * Ensure the alignment padded size could possibly fit in the data - * array. The largest possible data block must still leave room for - * at least the ID of the next block. + * If data blocks were allowed to be larger than half the data ring + * size, a wrapping data block could require more space than the full + * ringbuffer. */ - size = to_blk_size(size); - if (size > DATA_SIZE(data_ring) - sizeof(db->id)) - return false; + return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; +} - return true; +/* + * Compare the current and requested logical position and decide + * whether more space is needed. + * + * Return false when @lpos_current is already at or beyond @lpos_target. + * + * Also return false when the difference between the positions is bigger + * than the size of the data buffer. It might happen only when the caller + * raced with another CPU(s) which already made and used the space. + */ +static bool need_more_space(struct prb_data_ring *data_ring, + unsigned long lpos_current, + unsigned long lpos_target) +{ + return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring); } /* Query the state of a descriptor. */ @@ -580,7 +594,7 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. 
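The need_more_space() helper added above folds two conditions into one unsigned comparison: "already at or beyond the target" and "target is more than one whole buffer away" both come out false. A small standalone C model, with a made-up DATA_SIZE, shows the cases it accepts and rejects.

/*
 * Logical positions grow without bound and rely on unsigned wrap-around.
 * The expression is true only when 1 <= (target - current) <= DATA_SIZE.
 */
#include <stdbool.h>
#include <stdio.h>

#define DATA_SIZE 4096UL	/* stand-in for DATA_SIZE(data_ring) */

static bool need_more_space(unsigned long lpos_current, unsigned long lpos_target)
{
	return lpos_target - lpos_current - 1 < DATA_SIZE;
}

int main(void)
{
	unsigned long cur = 10000;

	printf("%d\n", need_more_space(cur, cur));		/* 0: nothing to do */
	printf("%d\n", need_more_space(cur, cur + 100));	/* 1: must push the tail */
	printf("%d\n", need_more_space(cur, cur - 100));	/* 0: already beyond target */
	printf("%d\n", need_more_space(cur, cur + DATA_SIZE + 1)); /* 0: raced, space already made */
	return 0;
}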
*/ - while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, lpos_begin, lpos_end)) { blk = to_block(data_ring, lpos_begin); /* @@ -671,7 +685,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ - while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, tail_lpos, lpos)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. @@ -1002,6 +1016,17 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) return true; } +static bool is_blk_wrapped(struct prb_data_ring *data_ring, + unsigned long begin_lpos, unsigned long next_lpos) +{ + /* + * Subtract one from next_lpos since it's not actually part of this data + * block. This allows perfectly fitting records to not wrap. + */ + return DATA_WRAPS(data_ring, begin_lpos) != + DATA_WRAPS(data_ring, next_lpos - 1); +} + /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) @@ -1013,7 +1038,7 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, next_lpos = lpos + size; /* First check if the data block does not wrap. */ - if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ @@ -1051,8 +1076,17 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); - if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { - /* Failed to allocate, specify a data-less block. */ + /* + * data_check_size() prevents data block allocation that could + * cause illegal ringbuffer states. But double check that the + * used space will not be bigger than the ring buffer. Wrapped + * messages need to reserve more space, see get_next_lpos(). + * + * Specify a data-less block when the check or the allocation + * fails. + */ + if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) || + !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; @@ -1081,7 +1115,7 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ - if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); @@ -1125,14 +1159,21 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ - wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); - /* If the data block does not increase, there is nothing to do. */ - if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + /* + * Use the current data block when the size does not increase, i.e. + * when @head_lpos is already able to accommodate the new @next_lpos. 
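is_blk_wrapped() above compares the wrap counts of the first byte and the last byte of a data block, so a block ending exactly on a wrap boundary is not treated as wrapping. A standalone model with DATA_WRAPS() approximated as lpos >> size_bits (the size here is chosen arbitrarily):

/*
 * next_lpos is the first position *after* the block, hence the -1: a block
 * that fills the ring exactly up to a lap boundary does not wrap.
 */
#include <stdbool.h>
#include <stdio.h>

#define SIZE_BITS 12UL		/* 4 KiB data ring in this model */

static unsigned long data_wraps(unsigned long lpos)
{
	return lpos >> SIZE_BITS;
}

static bool is_blk_wrapped(unsigned long begin_lpos, unsigned long next_lpos)
{
	return data_wraps(begin_lpos) != data_wraps(next_lpos - 1);
}

int main(void)
{
	printf("%d\n", is_blk_wrapped(4000, 4096));	/* 0: ends exactly on the boundary */
	printf("%d\n", is_blk_wrapped(4000, 4100));	/* 1: crosses the boundary */
	printf("%d\n", is_blk_wrapped(100, 300));	/* 0: entirely within one lap */
	return 0;
}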
+ * + * Note that need_more_space() could never return false here because + * the difference between the positions was bigger than the data + * buffer size. The data block is reopened and can't get reused. + */ + if (!need_more_space(data_ring, head_lpos, next_lpos)) { if (wrapped) blk = to_block(data_ring, 0); else @@ -1140,8 +1181,18 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, return &blk->data[0]; } - if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) + /* + * data_check_size() prevents data block reallocation that could + * cause illegal ringbuffer states. But double check that the + * new used space will not be bigger than the ring buffer. Wrapped + * messages need to reserve more space, see get_next_lpos(). + * + * Specify failure when the check or the allocation fails. + */ + if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) || + !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { return NULL; + } /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, @@ -1151,7 +1202,7 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, blk_lpos->begin); - if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ @@ -1187,7 +1238,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring, if (BLK_DATALESS(blk_lpos)) return 0; - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); @@ -1233,15 +1284,15 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } - /* Regular data block: @begin less than @next and in same wrap. */ - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && - blk_lpos->begin < blk_lpos->next) { + /* Regular data block: @begin and @next in the same wrap. */ + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ - } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == - DATA_WRAPS(data_ring, blk_lpos->next)) { + } else if (!is_blk_wrapped(data_ring, + blk_lpos->begin + DATA_SIZE(data_ring), + blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); @@ -1251,6 +1302,10 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } + /* Sanity check. Data-less blocks were handled earlier. */ + if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size)) + return NULL; + /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { @@ -1685,6 +1740,7 @@ fail: memset(r, 0, sizeof(*r)); return false; } +EXPORT_SYMBOL_IF_KUNIT(prb_reserve); /* Commit the data (possibly finalizing it) and restore interrupts. 
*/ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) @@ -1759,6 +1815,7 @@ void prb_commit(struct prb_reserved_entry *e) if (head_id != e->id) desc_make_final(e->rb, e->id); } +EXPORT_SYMBOL_IF_KUNIT(prb_commit); /** * prb_final_commit() - Commit and finalize (previously reserved) data to @@ -2143,7 +2200,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; @@ -2184,6 +2241,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, { return _prb_read_valid(rb, &seq, r, NULL); } +EXPORT_SYMBOL_IF_KUNIT(prb_read_valid); /** * prb_read_valid_info() - Non-blocking read of meta data for a requested @@ -2333,6 +2391,7 @@ void prb_init(struct printk_ringbuffer *rb, infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } +EXPORT_SYMBOL_IF_KUNIT(prb_init); /** * prb_record_text_space() - Query the full actual used ringbuffer space for diff --git a/kernel/printk/printk_ringbuffer_kunit_test.c b/kernel/printk/printk_ringbuffer_kunit_test.c new file mode 100644 index 000000000000..2282348e869a --- /dev/null +++ b/kernel/printk/printk_ringbuffer_kunit_test.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpuhplock.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/wait.h> + +#include <kunit/resource.h> +#include <kunit/test.h> + +#include "printk_ringbuffer.h" + +/* + * This KUnit tests the data integrity of the lockless printk_ringbuffer. + * From multiple CPUs it writes messages of varying length and content while + * a reader validates the correctness of the messages. + * + * IMPORTANT: The more CPUs you can use for this KUnit, the better! + * + * The test works by starting "num_online_cpus() - 1" writer threads, each + * pinned to their own CPU. Each writer thread loops, writing data of varying + * length into a printk_ringbuffer as fast as possible. The data content is + * an embedded data struct followed by string content repeating the byte: + * + * 'A' + CPUID + * + * The reader is running on the remaining online CPU, or if there is only one + * CPU on the same as the writer. + * It ensures that the embedded struct content is consistent with the string + * and that the string * is terminated and is composed of the same repeating + * byte as its first byte. + * + * Because the threads are running in such tight loops, they will call + * cond_resched() from time to time so the system stays functional. + * + * If the reader encounters an error, the test is aborted and some + * information about the error is reported. + * The runtime of the test can be configured with the runtime_ms module parameter. + * + * Note that the test is performed on a separate printk_ringbuffer instance + * and not the instance used by printk(). 
+ */ + +static unsigned long runtime_ms = 10 * MSEC_PER_SEC; +module_param(runtime_ms, ulong, 0400); + +/* test data structure */ +struct prbtest_rbdata { + unsigned int size; + char text[] __counted_by(size); +}; + +#define MAX_RBDATA_TEXT_SIZE 0x80 +#define MAX_PRB_RECORD_SIZE (sizeof(struct prbtest_rbdata) + MAX_RBDATA_TEXT_SIZE) + +struct prbtest_data { + struct kunit *test; + struct printk_ringbuffer *ringbuffer; + /* used by writers to signal reader of new records */ + wait_queue_head_t new_record_wait; +}; + +struct prbtest_thread_data { + unsigned long num; + struct prbtest_data *test_data; +}; + +static void prbtest_fail_record(struct kunit *test, const struct prbtest_rbdata *dat, u64 seq) +{ + unsigned int len; + + len = dat->size - 1; + + KUNIT_FAIL(test, "BAD RECORD: seq=%llu size=%u text=%.*s\n", + seq, dat->size, + len < MAX_RBDATA_TEXT_SIZE ? len : -1, + len < MAX_RBDATA_TEXT_SIZE ? dat->text : "<invalid>"); +} + +static bool prbtest_check_data(const struct prbtest_rbdata *dat) +{ + unsigned int len; + + /* Sane size? At least one character + trailing '\0' */ + if (dat->size < 2 || dat->size > MAX_RBDATA_TEXT_SIZE) + return false; + + len = dat->size - 1; + if (dat->text[len] != '\0') + return false; + + /* String repeats with the same character? */ + while (len--) { + if (dat->text[len] != dat->text[0]) + return false; + } + + return true; +} + +static int prbtest_writer(void *data) +{ + struct prbtest_thread_data *tr = data; + char text_id = 'A' + tr->num; + struct prb_reserved_entry e; + struct prbtest_rbdata *dat; + u32 record_size, text_size; + unsigned long count = 0; + struct printk_record r; + + kunit_info(tr->test_data->test, "start thread %03lu (writer)\n", tr->num); + + for (;;) { + /* ensure at least 1 character + trailing '\0' */ + text_size = get_random_u32_inclusive(2, MAX_RBDATA_TEXT_SIZE); + if (WARN_ON_ONCE(text_size < 2)) + text_size = 2; + if (WARN_ON_ONCE(text_size > MAX_RBDATA_TEXT_SIZE)) + text_size = MAX_RBDATA_TEXT_SIZE; + + record_size = sizeof(struct prbtest_rbdata) + text_size; + WARN_ON_ONCE(record_size > MAX_PRB_RECORD_SIZE); + + /* specify the text sizes for reservation */ + prb_rec_init_wr(&r, record_size); + + /* + * Reservation can fail if: + * + * - No free descriptor is available. + * - The buffer is full, and the oldest record is reserved + * but not yet committed. + * + * It actually happens in this test because all CPUs are trying + * to write an unbounded number of messages in a tight loop. + * These failures are intentionally ignored because this test + * focuses on races, ringbuffer consistency, and pushing system + * usability limits. 
+ */ + if (prb_reserve(&e, tr->test_data->ringbuffer, &r)) { + r.info->text_len = record_size; + + dat = (struct prbtest_rbdata *)r.text_buf; + dat->size = text_size; + memset(dat->text, text_id, text_size - 1); + dat->text[text_size - 1] = '\0'; + + prb_commit(&e); + + wake_up_interruptible(&tr->test_data->new_record_wait); + } + + if ((count++ & 0x3fff) == 0) + cond_resched(); + + if (kthread_should_stop()) + break; + } + + kunit_info(tr->test_data->test, "end thread %03lu: wrote=%lu\n", tr->num, count); + + return 0; +} + +struct prbtest_wakeup_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void prbtest_wakeup_callback(struct timer_list *timer) +{ + struct prbtest_wakeup_timer *wakeup = timer_container_of(wakeup, timer, timer); + + set_tsk_thread_flag(wakeup->task, TIF_NOTIFY_SIGNAL); + wake_up_process(wakeup->task); +} + +static int prbtest_reader(struct prbtest_data *test_data, unsigned long timeout_ms) +{ + struct prbtest_wakeup_timer wakeup; + char text_buf[MAX_PRB_RECORD_SIZE]; + unsigned long count = 0; + struct printk_info info; + struct printk_record r; + u64 seq = 0; + + wakeup.task = current; + timer_setup_on_stack(&wakeup.timer, prbtest_wakeup_callback, 0); + mod_timer(&wakeup.timer, jiffies + msecs_to_jiffies(timeout_ms)); + + prb_rec_init_rd(&r, &info, text_buf, sizeof(text_buf)); + + kunit_info(test_data->test, "start reader\n"); + + while (!wait_event_interruptible(test_data->new_record_wait, + prb_read_valid(test_data->ringbuffer, seq, &r))) { + /* check/track the sequence */ + if (info.seq < seq) + KUNIT_FAIL(test_data->test, "BAD SEQ READ: request=%llu read=%llu\n", + seq, info.seq); + + if (!prbtest_check_data((struct prbtest_rbdata *)r.text_buf)) + prbtest_fail_record(test_data->test, + (struct prbtest_rbdata *)r.text_buf, info.seq); + + if ((count++ & 0x3fff) == 0) + cond_resched(); + + seq = info.seq + 1; + } + + timer_delete_sync(&wakeup.timer); + timer_destroy_on_stack(&wakeup.timer); + + kunit_info(test_data->test, "end reader: read=%lu seq=%llu\n", count, info.seq); + + return 0; +} + +KUNIT_DEFINE_ACTION_WRAPPER(prbtest_cpumask_cleanup, free_cpumask_var, struct cpumask *); +KUNIT_DEFINE_ACTION_WRAPPER(prbtest_kthread_cleanup, kthread_stop, struct task_struct *); + +static void prbtest_add_cpumask_cleanup(struct kunit *test, cpumask_var_t mask) +{ + int err; + + err = kunit_add_action_or_reset(test, prbtest_cpumask_cleanup, mask); + KUNIT_ASSERT_EQ(test, err, 0); +} + +static void prbtest_add_kthread_cleanup(struct kunit *test, struct task_struct *kthread) +{ + int err; + + err = kunit_add_action_or_reset(test, prbtest_kthread_cleanup, kthread); + KUNIT_ASSERT_EQ(test, err, 0); +} + +static inline void prbtest_prb_reinit(struct printk_ringbuffer *rb) +{ + prb_init(rb, rb->text_data_ring.data, rb->text_data_ring.size_bits, rb->desc_ring.descs, + rb->desc_ring.count_bits, rb->desc_ring.infos); +} + +static void test_readerwriter(struct kunit *test) +{ + /* Equivalent to CONFIG_LOG_BUF_SHIFT=13 */ + DEFINE_PRINTKRB(test_rb, 8, 5); + + struct prbtest_thread_data *thread_data; + struct prbtest_data *test_data; + struct task_struct *thread; + cpumask_var_t test_cpus; + int cpu, reader_cpu; + + KUNIT_ASSERT_TRUE(test, alloc_cpumask_var(&test_cpus, GFP_KERNEL)); + prbtest_add_cpumask_cleanup(test, test_cpus); + + cpus_read_lock(); + /* + * Failure of KUNIT_ASSERT() kills the current task + * so it can not be called while the CPU hotplug lock is held. + * Instead use a snapshot of the online CPUs. 
+ * If they change during test execution it is unfortunate but not a grave error. + */ + cpumask_copy(test_cpus, cpu_online_mask); + cpus_read_unlock(); + + /* One CPU is for the reader, all others are writers */ + reader_cpu = cpumask_first(test_cpus); + if (cpumask_weight(test_cpus) == 1) + kunit_warn(test, "more than one CPU is recommended"); + else + cpumask_clear_cpu(reader_cpu, test_cpus); + + /* KUnit test can get restarted more times. */ + prbtest_prb_reinit(&test_rb); + + test_data = kunit_kmalloc(test, sizeof(*test_data), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, test_data); + test_data->test = test; + test_data->ringbuffer = &test_rb; + init_waitqueue_head(&test_data->new_record_wait); + + kunit_info(test, "running for %lu ms\n", runtime_ms); + + for_each_cpu(cpu, test_cpus) { + thread_data = kunit_kmalloc(test, sizeof(*thread_data), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, thread_data); + thread_data->test_data = test_data; + thread_data->num = cpu; + + thread = kthread_run_on_cpu(prbtest_writer, thread_data, cpu, + "prbtest writer %u"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, thread); + prbtest_add_kthread_cleanup(test, thread); + } + + kunit_info(test, "starting test\n"); + + set_cpus_allowed_ptr(current, cpumask_of(reader_cpu)); + prbtest_reader(test_data, runtime_ms); + + kunit_info(test, "completed test\n"); +} + +static struct kunit_case prb_test_cases[] = { + KUNIT_CASE_SLOW(test_readerwriter), + {} +}; + +static struct kunit_suite prb_test_suite = { + .name = "printk-ringbuffer", + .test_cases = prb_test_cases, +}; +kunit_test_suite(prb_test_suite); + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); +MODULE_AUTHOR("John Ogness <john.ogness@linutronix.de>"); +MODULE_DESCRIPTION("printk_ringbuffer KUnit test"); +MODULE_LICENSE("GPL"); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. 
*/ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -954,7 +955,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". 
+ */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. + */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. */ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 12e4c64ebae1..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD when looking for certain types of RCU usage bugs, for example, too-short RCU read-side critical sections. 
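For completeness, a userspace sketch of how a tracer might use the new PTRACE_SET_SYSCALL_INFO request together with the existing PTRACE_GET_SYSCALL_INFO: at a syscall-entry stop it rewrites the syscall number to -1 so the kernel skips the call, matching the sign-extension rules checked above. This assumes a kernel carrying this patch and a glibc recent enough to provide struct __ptrace_syscall_info; the 0x4212 fallback value is an assumption taken from the upstream series and should be verified against the UAPI headers.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PTRACE_SET_SYSCALL_INFO
#define PTRACE_SET_SYSCALL_INFO 0x4212	/* assumption: value used by the upstream series */
#endif

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);			/* let the parent set options */
		getppid();			/* the syscall the tracer will skip */
		_exit(0);
	}

	waitpid(pid, NULL, 0);			/* child stopped with SIGSTOP */
	/* TRACESYSGOOD makes syscall stops report si_code SIGTRAP|0x80, as checked above. */
	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD);
	ptrace(PTRACE_SYSCALL, pid, 0, 0);	/* run to the next syscall-entry stop */
	waitpid(pid, NULL, 0);

	struct __ptrace_syscall_info info;	/* glibc's name for the uapi struct */

	memset(&info, 0, sizeof(info));
	if (ptrace(PTRACE_GET_SYSCALL_INFO, pid, (void *)sizeof(info), &info) < 0)
		perror("PTRACE_GET_SYSCALL_INFO");
	printf("entry stop: syscall nr %llu\n", (unsigned long long)info.entry.nr);

	/* op stays PTRACE_SYSCALL_INFO_ENTRY; nr = -1 asks the kernel to skip the call. */
	info.entry.nr = (__uint64_t)-1;
	if (ptrace(PTRACE_SET_SYSCALL_INFO, pid, (void *)sizeof(info), &info) < 0)
		perror("PTRACE_SET_SYSCALL_INFO");

	ptrace(PTRACE_CONT, pid, 0, 0);
	waitpid(pid, NULL, 0);
	return 0;
}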
+ +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index eed2951a4962..9cf01832a6c3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -57,6 +57,9 @@ /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 +/* A complete grace period count */ +#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1) + extern int sysctl_sched_rt_runtime; /* @@ -157,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred, but do not allow the * (ULONG_MAX / 2) safety-factor/guard-band. + * + * The token returned by get_state_synchronize_rcu_full() is based on + * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full() + * against the root rnp->gp_seq. Since rcu_seq_start() is first called + * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq, + * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace + * periods ahead of the root rnp->gp_seq. To prevent false-positives with the + * full polling API that a wrap around instantly completed the GP, when nothing + * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT(). 
*/ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP)); } /* @@ -572,6 +584,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c_old, unsigned long c); void rcu_gp_set_torture_wait(int duration); +void rcu_set_gpwrap_lag(unsigned long lag); +int rcu_get_gpwrap_count(int cpu); #else static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { @@ -589,6 +603,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, do { } while (0) #endif static inline void rcu_gp_set_torture_wait(int duration) { } +static inline void rcu_set_gpwrap_lag(unsigned long lag) { } +static inline int rcu_get_gpwrap_count(int cpu) { return 0; } #endif unsigned long long rcutorture_gather_gp_seqs(void); void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 0f3059b1b80d..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -762,7 +762,7 @@ kfree_scale_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -796,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4fa7772be183..07e51974b06b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. 
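The guard band in rcu_seq_done_exact() above can be checked with a standalone model: with two low-order state bits, one grace period advances the sequence by RCU_SEQ_GP == 4, and only counters sitting in the two-GP window just behind the snapshot are reported as not done. The wrap-safe comparisons use the same signed-difference trick as the kernel's ULONG_CMP_GE()/ULONG_CMP_LT().

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define RCU_SEQ_STATE_MASK	0x3UL
#define RCU_SEQ_GP		(RCU_SEQ_STATE_MASK + 1)
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

static bool rcu_seq_done_exact(unsigned long cur_s, unsigned long s)
{
	/* Done unless cur_s sits in the 2-GP window just behind the snapshot. */
	return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP));
}

int main(void)
{
	unsigned long s = 400;			/* snapshot from rcu_seq_snap() */

	printf("%d\n", rcu_seq_done_exact(s, s));			/* 1: caught up */
	printf("%d\n", rcu_seq_done_exact(s - RCU_SEQ_GP, s));		/* 0: 1 GP behind */
	printf("%d\n", rcu_seq_done_exact(s - 2 * RCU_SEQ_GP, s));	/* 0: 2 GPs behind */
	printf("%d\n", rcu_seq_done_exact(s - 3 * RCU_SEQ_GP, s));	/* 1: outside the guard band */
	return 0;
}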
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). +#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader. +#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader. +#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer. + // Note: Manual start, automatic end. +#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above. +#define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ - RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN. #define RCUTORTURE_RDR_ALLBITS \ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) @@ -110,11 +112,16 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing"); +torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period."); +torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)"); +torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); @@ -152,6 +159,7 @@ static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; @@ -374,11 +382,14 @@ struct rcu_torture_ops { void (*readunlock)(int idx); int (*readlock_held)(void); // lockdep. int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. 
+ int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + void (*exp_current)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); @@ -413,6 +424,8 @@ struct rcu_torture_ops { bool (*reader_blocked)(void); unsigned long long (*gather_gp_seqs)(void); void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); + void (*set_gpwrap_lag)(unsigned long lag); + int (*get_gpwrap_count)(int cpu); long cbflood_max; int irq_capable; int can_boost; @@ -421,6 +434,7 @@ struct rcu_torture_ops { int no_pi_lock; int debug_objects; int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -458,7 +472,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits. */ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -619,6 +633,8 @@ static struct rcu_torture_ops rcu_ops = { : NULL, .gather_gp_seqs = rcutorture_gather_gp_seqs, .format_gp_seqs = rcutorture_format_gp_seqs, + .set_gpwrap_lag = rcu_set_gpwrap_lag, + .get_gpwrap_count = rcu_get_gpwrap_count, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -676,10 +692,29 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +DEFINE_STATIC_SRCU_FAST(srcu_ctlf); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) + VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + srcu_ctlp = &srcu_ctlf; + VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU"); + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + srcu_ctlp = &srcu_ctlfud; + VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU"); + } +} + static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) { srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); @@ -703,13 +738,14 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & SRCU_READ_FLAVOR_LITE) { - idx = srcu_read_lock_lite(srcu_ctlp); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } - if (reader_flavor & SRCU_READ_FLAVOR_FAST) { - scp = srcu_read_lock_fast(srcu_ctlp); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 3; @@ -739,10 +775,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor 
&& (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_read_unlock_fast_updown(srcu_ctlp, + __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_FAST) - srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); - if (reader_flavor & SRCU_READ_FLAVOR_LITE) - srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2)); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -754,6 +791,50 @@ static int torture_srcu_read_lock_held(void) return srcu_read_lock_held(srcu_ctlp); } +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -805,18 +886,26 @@ static void srcu_torture_synchronize_expedited(void) synchronize_srcu_expedited(srcu_ctlp); } +static void srcu_torture_expedite_current(void) +{ + srcu_expedite_current(srcu_ctlp); +} + static struct rcu_torture_ops srcu_ops = { .ttype = SRCU_FLAVOR, - .init = rcu_sync_torture_init, + .init = srcu_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -831,13 +920,29 @@ static struct rcu_torture_ops srcu_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 
0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; -static void srcu_torture_init(void) +static void srcud_torture_init(void) { rcu_sync_torture_init(); - WARN_ON(init_srcu_struct(&srcu_ctld)); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU"); + } else { + WARN_ON(init_srcu_struct(&srcu_ctld)); + } srcu_ctlp = &srcu_ctld; } @@ -850,17 +955,20 @@ static void srcu_torture_cleanup(void) /* As above, but dynamically allocated. */ static struct rcu_torture_ops srcud_ops = { .ttype = SRCU_FLAVOR, - .init = srcu_torture_init, + .init = srcud_torture_init, .cleanup = srcu_torture_cleanup, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -875,6 +983,8 @@ static struct rcu_torture_ops srcud_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; @@ -902,7 +1012,8 @@ static struct rcu_torture_ops busted_srcud_ops = { /* * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. */ static void synchronize_rcu_trivial(void) @@ -915,6 +1026,16 @@ static void synchronize_rcu_trivial(void) } } +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + static int rcu_torture_read_lock_trivial(void) { preempt_disable(); @@ -928,7 +1049,7 @@ static void rcu_torture_read_unlock_trivial(int idx) static struct rcu_torture_ops trivial_ops = { .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, + .init = rcu_sync_torture_init_trivial, .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_torture_read_unlock_trivial, @@ -1457,7 +1578,7 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) static int rcu_torture_writer(void *arg) { - bool boot_ended; + bool booting_still = false; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; struct rcu_gp_oldstate cookie_full; @@ -1468,6 +1589,7 @@ rcu_torture_writer(void *arg) struct rcu_gp_oldstate gp_snap1_full; int i; int idx; + unsigned long j; int oldnice = task_nice(current); struct rcu_gp_oldstate *rgo = NULL; int rgo_size = 0; @@ -1500,16 +1622,26 @@ rcu_torture_writer(void *arg) return 0; } if (cur_ops->poll_active > 0) { - ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL); if (!WARN_ON(!ulo)) ulo_size = cur_ops->poll_active; } if (cur_ops->poll_active_full > 0) { - rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + rgo = kcalloc(cur_ops->poll_active_full, sizeof(*rgo), GFP_KERNEL); if (!WARN_ON(!rgo)) rgo_size = cur_ops->poll_active_full; } + // If the system is still booting, let it finish. + j = jiffies; + while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) { + booting_still = true; + schedule_timeout_interruptible(HZ); + } + if (booting_still) + pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n", + torture_type, jiffies - j); + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; torture_hrtimeout_us(500, 1000, &rand); @@ -1618,6 +1750,8 @@ rcu_torture_writer(void *arg) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); for (i = 0; i < ulo_size; i++) @@ -1638,6 +1772,8 @@ rcu_torture_writer(void *arg) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); for (i = 0; i < rgo_size; i++) @@ -1698,13 +1834,11 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended && time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && @@ -1714,6 +1848,7 @@ rcu_torture_writer(void *arg) cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); + break; } if (stutter_waited) sched_set_normal(current, oldnice); @@ -1907,14 +2042,14 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, // Verify the specified RCUTORTURE_RDR* state. 
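The polled grace-period loop that rcu_torture_writer() drives through cur_ops->start_gp_poll()/poll_gp_state() boils down to the core RCU pattern below; this is an illustrative sketch written against the generic RCU API, not code taken from the patch.

        unsigned long cookie;

        cookie = start_poll_synchronize_rcu();          /* snapshot, and start a GP if none is running */
        while (!poll_state_synchronize_rcu(cookie))     /* true once that GP has elapsed */
                schedule_timeout_uninterruptible(1);
        /* All readers that began before the cookie was taken have now finished. */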
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() -static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) { int mask; - if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) return; - WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. @@ -1922,21 +2057,21 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, return; WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + !softirq_count(), ROEC_ARGS); WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); WARN_ONCE(cur_ops->readlock_nesting && (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && cur_ops->readlock_nesting() == 0, ROEC_ARGS); - // Timer handlers have all sorts of stuff disabled, so ignore + // Interrupt handlers have all sorts of stuff disabled, so ignore // unintended disabling. - if (insoftirq) + if (in_serving_softirq() || in_hardirq()) return; WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + softirq_count(), ROEC_ARGS); /* * non-preemptible RCU in a preemptible kernel uses preempt_disable() @@ -1957,6 +2092,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1970,8 +2108,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { bool first; @@ -1985,8 +2122,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); - rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -2006,8 +2143,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; // Complain unless both the old and the new protection is in place. 
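The switch above from open-coded preempt_count() & SOFTIRQ_MASK tests to softirq_count() matters on PREEMPT_RT, where BH-disable state is tracked per task rather than in preempt_count(). Roughly, and paraphrased from include/linux/preempt.h, so treat the exact form as an approximation:

        #ifdef CONFIG_PREEMPT_RT
        #define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
        #else
        #define softirq_count() (preempt_count() & SOFTIRQ_MASK)
        #endif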
- rcutorture_one_extend_check("during change", - idxold1 | statesnew, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); // Sample CPU under both sets of protections to reduce confusion. if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { @@ -2061,6 +2197,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } + if (statesold & RCUTORTURE_RDR_UPDOWN) { + cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -2077,7 +2218,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); - rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -2144,8 +2285,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. */ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -2159,58 +2299,77 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } -/* - * Do one read-side critical section, returning false if there was - * no data to read. Can be invoked both from process context and - * from a timer handler. - */ -static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) -{ - bool checkpolling = !(torture_random(trsp) & 0xfff); +struct rcu_torture_one_read_state { + bool checkpolling; unsigned long cookie; struct rcu_gp_oldstate cookie_full; - int i; unsigned long started; - unsigned long completed; - int newstate; struct rcu_torture *p; - int pipe_count; - bool preempted = false; - int readstate = 0; - struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; - struct rt_read_seg *rtrsp = &rtseg[0]; - struct rt_read_seg *rtrsp1; + int readstate; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS]; + struct rt_read_seg *rtrsp; unsigned long long ts; +}; - WARN_ON_ONCE(!rcu_is_watching()); - newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); - if (checkpolling) { +static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + memset(rtorsp, 0, sizeof(*rtorsp)); + rtorsp->checkpolling = !(torture_random(trsp) & 0xfff); + rtorsp->rtrsp = &rtorsp->rtseg[0]; +} + +/* + * Set up the first segment of a series of overlapping read-side + * critical sections. 
The caller must have actually initiated the + * outermost read-side critical section. + */ +static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + rtorsp->cookie = cur_ops->get_gp_state(); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - cur_ops->get_gp_state_full(&cookie_full); + cur_ops->get_gp_state_full(&rtorsp->cookie_full); } - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - !cur_ops->readlock_held || cur_ops->readlock_held()); - if (p == NULL) { + rtorsp->started = cur_ops->get_gp_seq(); + rtorsp->ts = rcu_trace_clock_local(); + rtorsp->p = rcu_dereference_check(rcu_torture_current, + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); + if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } - if (p->rtort_mbtest == 0) + if (rtorsp->p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); + rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp); + return true; +} + +/* + * Complete the last segment of a series of overlapping read-side + * critical sections and check for errors. + */ +static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + int i; + unsigned long completed; + int pipe_count; + bool preempted = false; + struct rt_read_seg *rtrsp1; + preempt_disable(); - pipe_count = READ_ONCE(p->rtort_pipe_count); + pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { // Should not happen in a correct RCU implementation, // happens quite often for torture_type=busted. @@ -2218,28 +2377,28 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); + do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu, + rtorsp->ts, rtorsp->started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); + completed = rcutorture_seq_diff(completed, rtorsp->started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (checkpolling) { + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie), "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + rtorsp->cookie, cur_ops->get_gp_state()); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full), "%s: Cookie check 6 failed %s(%d) online %*pbl\n", __func__, rcu_torture_writer_state_getname(), @@ -2248,21 +2407,41 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); - WARN_ON_ONCE(readstate); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); + WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. - WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); + WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { i = 0; - for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; rt_read_preempted = preempted; } +} +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) +{ + int newstate; + struct rcu_torture_one_read_state rtors; + + WARN_ON_ONCE(!rcu_is_watching()); + init_rcu_torture_one_read_state(&rtors, trsp); + newstate = rcutorture_extend_mask(rtors.readstate, trsp); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) + return false; + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -2307,7 +2486,7 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. 
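The "CPU bound, so need tick" comments added in this patch refer to the usual nohz_full bracketing: a kthread that can monopolize a CPU must keep the scheduler tick alive so that RCU and the scheduler can make progress. A minimal sketch of that bracketing (do_cpu_bound_work() is hypothetical):

        tick_dep_set_task(current, TICK_DEP_BIT_RCU);
        while (!kthread_should_stop())
                do_cpu_bound_work();
        tick_dep_clear_task(current, TICK_DEP_BIT_RCU);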
do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) @@ -2319,19 +2498,166 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) + while (!torture_must_stop() && + (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended())) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { timer_delete_sync(&t); - destroy_timer_on_stack(&t); + timer_destroy_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + ktime_t rtorsu_kt; + int rtorsu_cpu; + unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + int cpu = raw_smp_processor_id(); + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", 
__func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + } + + } + kfree(updownreaders); + updownreaders = NULL; +} + +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + unsigned long j; + struct rcu_torture_one_read_state_updown *rtorsup; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); + continue; + } + rcu_torture_updown_one(rtorsup); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + /* * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to * increase race probabilities and fuzzes the interval between toggling. 
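The up/down readers above depend on the srcu_down_read()/srcu_up_read() semantics, in which the matching "up" need not run in the same context as the "down". A minimal sketch outside rcutorture (my_srcu is a hypothetical srcu_struct):

        int idx;

        idx = srcu_down_read(&my_srcu);         /* like down(): no same-task requirement */
        /* ... hand idx off together with the protected work ... */
        srcu_up_read(&my_srcu, idx);            /* may run from a timer handler or another task */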
@@ -2394,6 +2720,11 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2404,11 +2735,21 @@ rcu_torture_stats_print(void) pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } + if (cur_ops->get_gpwrap_count) + n_gpwraps += cur_ops->get_gpwrap_count(cpu); + } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); + } } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); rtcp = rcu_access_pointer(rcu_torture_current); @@ -2429,14 +2770,17 @@ rcu_torture_stats_print(void) n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. - pr_cont("nocb-toggles: %ld:%ld\n", + pr_cont("nocb-toggles: %ld:%ld ", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); + pr_cont("gpwraps: %ld\n", n_gpwraps); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -2474,7 +2818,8 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_access_pointer(rcu_torture_current) && - !rcu_stall_is_suppressed()) { + !rcu_stall_is_suppressed() && + rcu_inkernel_boot_has_ended()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; @@ -2582,7 +2927,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " - "preempt_duration=%d preempt_interval=%d\n", + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2596,7 +2941,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis, - preempt_duration, preempt_interval); + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3036,7 +3381,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rfp->rcu_launder_gp_seq_start = gps; - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. 
while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -3164,6 +3509,8 @@ static int rcu_torture_fwd_prog(void *args) int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); @@ -3607,6 +3954,57 @@ static int rcu_torture_preempt(void *unused) static enum cpuhp_state rcutor_hp; +static struct hrtimer gpwrap_lag_timer; +static bool gpwrap_lag_active; + +/* Timer handler for toggling RCU grace-period sequence overflow test lag value */ +static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer) +{ + ktime_t next_delay; + + if (gpwrap_lag_active) { + pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n"); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; + next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0); + } else { + pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps); + cur_ops->set_gpwrap_lag(gpwrap_lag_gps); + gpwrap_lag_active = true; + next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0); + } + + if (torture_must_stop_irq()) + return HRTIMER_NORESTART; + + hrtimer_forward_now(timer, next_delay); + return HRTIMER_RESTART; +} + +static int rcu_gpwrap_lag_init(void) +{ + if (!gpwrap_lag) + return 0; + + if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) { + pr_alert("rcu-torture: lag timing parameters must be positive\n"); + return -EINVAL; + } + + hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gpwrap_lag_active = false; + hrtimer_start(&gpwrap_lag_timer, + ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL); + + return 0; +} + +static void rcu_gpwrap_lag_cleanup(void) +{ + hrtimer_cancel(&gpwrap_lag_timer); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; +} static void rcu_torture_cleanup(void) { @@ -3648,6 +4046,10 @@ rcu_torture_cleanup(void) nocb_tasks = NULL; } + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -3776,6 +4178,9 @@ rcu_torture_cleanup(void) torture_cleanup_end(); if (cur_ops->gp_slow_unregister) cur_ops->gp_slow_unregister(NULL); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) + rcu_gpwrap_lag_cleanup(); } static void rcu_torture_leak_cb(struct rcu_head *rhp) @@ -4141,11 +4546,6 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ rcu_torture_write_types(); - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (torture_init_error(firsterr)) - goto unwind; - if (nrealfakewriters > 0) { fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), @@ -4178,6 +4578,15 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; @@ -4272,9 +4681,17 @@ rcu_torture_init(void) } if (object_debug) rcu_test_debug_objects(); - torture_init_end(); + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) { + firsterr = rcu_gpwrap_lag_init(); + if (torture_init_error(firsterr)) + goto unwind; + } + + torture_init_end(); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..07a313782dfd 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, // Number of typesafe_lookup structures, that is, the degree of concurrency. torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -136,6 +136,7 @@ struct ref_scale_ops { void (*cleanup)(void); void (*readsection)(const int nloops); void (*delaysection)(const int nloops, const int udl, const int ndl); + bool enable_irqs; const char *name; }; @@ -184,6 +185,8 @@ static const struct ref_scale_ops rcu_ops = { // Definitions for SRCU ref scale testing. 
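The gpwrap-lag toggling added a little earlier in this patch is a self-rearming hrtimer. As a standalone sketch of that pattern (the my_-prefixed names and the one-second period are invented for illustration):

        static struct hrtimer my_timer;

        static enum hrtimer_restart my_period_handler(struct hrtimer *t)
        {
                /* Do the periodic work, then push the expiry forward one period. */
                hrtimer_forward_now(t, ms_to_ktime(1000));
                return HRTIMER_RESTART;
        }

        static void my_period_start(void)
        {
                hrtimer_setup(&my_timer, my_period_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                hrtimer_start(&my_timer, ms_to_ktime(1000), HRTIMER_MODE_REL);
        }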
DEFINE_STATIC_SRCU(srcu_refctl_scale); +DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale); static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; static void srcu_ref_scale_read_section(const int nloops) @@ -216,6 +219,12 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static bool srcu_fast_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_refctl_scale; + return true; +} + static void srcu_fast_ref_scale_read_section(const int nloops) { int i; @@ -240,40 +249,46 @@ static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, c } static const struct ref_scale_ops srcu_fast_ops = { - .init = rcu_sync_scale_init, + .init = srcu_fast_sync_scale_init, .readsection = srcu_fast_ref_scale_read_section, .delaysection = srcu_fast_ref_scale_delay_section, .name = "srcu-fast" }; -static void srcu_lite_ref_scale_read_section(const int nloops) +static bool srcu_fast_updown_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_updown_refctl_scale; + return true; +} + +static void srcu_fast_updown_ref_scale_read_section(const int nloops) { int i; - int idx; + struct srcu_ctr __percpu *scp; for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - srcu_read_unlock_lite(srcu_ctlp, idx); + scp = srcu_read_lock_fast_updown(srcu_ctlp); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); } } -static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl) { int i; - int idx; + struct srcu_ctr __percpu *scp; for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); + scp = srcu_read_lock_fast_updown(srcu_ctlp); un_delay(udl, ndl); - srcu_read_unlock_lite(srcu_ctlp, idx); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); } } -static const struct ref_scale_ops srcu_lite_ops = { - .init = rcu_sync_scale_init, - .readsection = srcu_lite_ref_scale_read_section, - .delaysection = srcu_lite_ref_scale_delay_section, - .name = "srcu-lite" +static const struct ref_scale_ops srcu_fast_updown_ops = { + .init = srcu_fast_updown_sync_scale_init, + .readsection = srcu_fast_updown_ref_scale_read_section, + .delaysection = srcu_fast_updown_ref_scale_delay_section, + .name = "srcu-fast-updown" }; #ifdef CONFIG_TASKS_RCU @@ -353,6 +368,9 @@ static const struct ref_scale_ops rcu_trace_ops = { // Definitions for reference count static atomic_t refcnt; +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + static void ref_refcnt_section(const int nloops) { int i; @@ -381,6 +399,184 @@ static const struct ref_scale_ops refcnt_ops = { .name = "refcnt" }; +static void ref_percpuinc_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + this_cpu_dec(test_acqrel); + } +} + +static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + un_delay(udl, ndl); + this_cpu_dec(test_acqrel); + } +} + +static const struct ref_scale_ops percpuinc_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_percpuinc_section, + .delaysection = ref_percpuinc_delay_section, + .name = "percpuinc" +}; + +// Note that this can lose counts in preemptible kernels. 
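To illustrate the note that closes the hunk above: the READ_ONCE()/WRITE_ONCE() per-CPU increment is a load-modify-store that can be preempted (and the task migrated) between the load and the store, so a concurrent update of the same counter can be overwritten, whereas this_cpu_inc() performs the update as a single preemption-safe per-CPU operation. Sketch:

        unsigned long *tap = this_cpu_ptr(&test_acqrel);

        WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);  /* can lose counts if preempted in between */
        this_cpu_inc(test_acqrel);              /* single per-CPU op, no lost counts */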
+static void ref_incpercpu_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static const struct ref_scale_ops incpercpu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpu_section, + .delaysection = ref_incpercpu_delay_section, + .name = "incpercpu" +}; + +static void ref_incpercpupreempt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static const struct ref_scale_ops incpercpupreempt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpupreempt_section, + .delaysection = ref_incpercpupreempt_delay_section, + .name = "incpercpupreempt" +}; + +static void ref_incpercpubh_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static const struct ref_scale_ops incpercpubh_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpubh_section, + .delaysection = ref_incpercpubh_delay_section, + .enable_irqs = true, + .name = "incpercpubh" +}; + +static void ref_incpercpuirqsave_section(const int nloops) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static const struct ref_scale_ops incpercpuirqsave_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpuirqsave_section, + .delaysection = ref_incpercpuirqsave_delay_section, + .name = "incpercpuirqsave" +}; + // Definitions for rwlock static rwlock_t test_rwlock; @@ -524,9 +720,6 @@ static 
const struct ref_scale_ops lock_irq_ops = { .name = "lock-irq" }; -// Definitions acquire-release. -static DEFINE_PER_CPU(unsigned long, test_acqrel); - static void ref_acqrel_section(const int nloops) { unsigned long x; @@ -659,6 +852,133 @@ static const struct ref_scale_ops jiffies_ops = { .name = "jiffies" }; +static void ref_preempt_section(const int nloops) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + preempt_enable(); + } + migrate_enable(); +} + +static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + un_delay(udl, ndl); + preempt_enable(); + } + migrate_enable(); +} + +static const struct ref_scale_ops preempt_ops = { + .readsection = ref_preempt_section, + .delaysection = ref_preempt_delay_section, + .name = "preempt" +}; + +static void ref_bh_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + local_bh_enable(); + } + preempt_enable(); +} + +static void ref_bh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + un_delay(udl, ndl); + local_bh_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops bh_ops = { + .readsection = ref_bh_section, + .delaysection = ref_bh_delay_section, + .enable_irqs = true, + .name = "bh" +}; + +static void ref_irq_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + local_irq_enable(); + } + preempt_enable(); +} + +static void ref_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + un_delay(udl, ndl); + local_irq_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops irq_ops = { + .readsection = ref_irq_section, + .delaysection = ref_irq_delay_section, + .name = "irq" +}; + +static void ref_irqsave_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + local_irq_restore(flags); + } + preempt_enable(); +} + +static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + un_delay(udl, ndl); + local_irq_restore(flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops irqsave_ops = { + .readsection = ref_irqsave_section, + .delaysection = ref_irqsave_delay_section, + .name = "irqsave" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. @@ -954,15 +1274,18 @@ repeat: if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) rcu_scale_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); + // Also keep interrupts disabled when it is safe to do so, which + // it is not for local_bh_enable(). This also has the effect of + // preventing entries into slow path for rcu_read_unlock(). 
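The restriction the comment above alludes to is that local_bh_enable() may need to run pending softirqs, so the softirq core's debug checks complain if it is called with hardirqs disabled; this is why the bh-based ops set ->enable_irqs and skip the surrounding local_irq_save(). Sketch of the two orderings:

        /* Problematic: pending softirqs cannot run with hardirqs off. */
        local_irq_save(flags);
        local_bh_disable();
        local_bh_enable();              /* softirq core warns here */
        local_irq_restore(flags);

        /* Fine: the hardirq-off section is nested inside the BH-off section. */
        local_bh_disable();
        local_irq_save(flags);
        local_irq_restore(flags);
        local_bh_enable();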
+ if (!cur_ops->enable_irqs) + local_irq_save(flags); start = ktime_get_mono_fast_ns(); rcu_scale_one_reader(); duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); + if (!cur_ops->enable_irqs) + local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; // To reduce runtime-skew noise, do maintain-load invocations until @@ -1051,7 +1374,7 @@ static int main_func(void *arg) set_user_nice(current, MAX_NICE); VERBOSE_SCALEOUT("main_func task started"); - result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL); buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { SCALEOUT_ERRSTRING("out of memory"); @@ -1140,7 +1463,7 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } @@ -1163,9 +1486,9 @@ ref_scale_cleanup(void) reader_tasks[i].task); } kfree(reader_tasks); + reader_tasks = NULL; torture_stop_kthread("main_task", main_task); - kfree(main_task); // Do scale-type-specific cleanup operations. if (cur_ops->cleanup != NULL) @@ -1193,9 +1516,13 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, - &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops, + RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops, + &incpercpubh_ops, &incpercpuirqsave_ops, + &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, + &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; @@ -1238,12 +1565,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). 
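The kzalloc()-to-kcalloc() conversions in this hunk (and in the rcutorture writer earlier) follow the usual rule for array allocations: let the allocator check the count-times-size multiplication. Illustrative:

        buf = kcalloc(n, sizeof(*buf), GFP_KERNEL);     /* NULL if n * sizeof(*buf) overflows */
        buf = kzalloc(n * sizeof(*buf), GFP_KERNEL);    /* multiplication can silently wrap */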
if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6e9fe2ce1075..3450c3751ef7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -106,15 +106,15 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPTION operation - * means that we get away with murder on synchronization. ;-) + * that become ready as a result. Single-CPU operation and preemption + * disabling mean that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) { @@ -141,7 +141,12 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + do { + // Deadlock issues prevent __srcu_read_unlock() from + // doing an unconditional wakeup, so polling is required. + swait_event_timeout_exclusive(ssp->srcu_wq, + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10); + } while (READ_ONCE(ssp->srcu_lock_nesting[idx])); preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); @@ -176,10 +181,9 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_LAZY + lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { - preempt_enable(); return; } WRITE_ONCE(ssp->srcu_idx_max, cookie); @@ -189,7 +193,6 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } - preempt_enable(); } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9a59b071501b..ea3f128de06f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -286,32 +286,92 @@ err_free_sup: #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. 
*/ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * init_srcu_struct - initialize a sleep-RCU structure * @ssp: structure to initialize. * - * Must invoke this on a given srcu_struct before passing that srcu_struct + * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary + * to invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ int init_srcu_struct(struct srcu_struct *ssp) { + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock_fast() and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + +/** + * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and + * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct + * structures that are to be passed to srcu_read_lock_fast_updown(), + * srcu_down_read_fast(), and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. 
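A minimal usage sketch for the dynamically initialized fast flavors documented above (my_ss and the error handling are illustrative, not taken from the patch):

        struct srcu_struct my_ss;
        struct srcu_ctr __percpu *scp;

        if (init_srcu_struct_fast(&my_ss))      /* nonzero return means initialization failed */
                return -ENOMEM;
        scp = srcu_read_lock_fast(&my_ss);
        /* ... read-side critical section ... */
        srcu_read_unlock_fast(&my_ss, scp);
        synchronize_srcu(&my_ss);
        cleanup_srcu_struct(&my_ss);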
+ */ +int init_srcu_struct_fast_updown(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* @@ -461,7 +521,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; - unsigned long mask = 0; + unsigned long mask = ssp->srcu_reader_flavor; unsigned long sum = 0; for_each_possible_cpu(cpu) { @@ -502,6 +562,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) */ if (!did_gp) smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ else synchronize_rcu(); /* X */ @@ -732,6 +794,10 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && + old_read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor); if (!old_read_flavor) { old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); if (!old_read_flavor) @@ -1166,6 +1232,16 @@ static void srcu_flip(struct srcu_struct *ssp) * counter update. Note that both this memory barrier and the * one in srcu_readers_active_idx_check() provide the guarantee * for __srcu_read_lock(). + * + * Note that this is a performance optimization, in which we spend + * an otherwise unnecessary smp_mb() in order to reduce the number + * of full per-CPU-variable scans in srcu_readers_lock_idx() and + * srcu_readers_unlock_idx(). But this performance optimization + * is not so optimal for SRCU-fast, where we would be spending + * not smp_mb(), but rather synchronize_rcu(). At the same time, + * the overhead of the smp_mb() is in the noise, so there is no + * point in omitting it in the SRCU-fast case. So the same code + * is executed either way. */ smp_mb(); /* D */ /* Pairs with C. */ } @@ -1589,7 +1665,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { if (cookie != SRCU_GET_STATE_COMPLETED && - !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1676,6 +1752,64 @@ void srcu_barrier(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. */ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. 
+ if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. + */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c0cc7ae41106..2dc044fd126e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -316,7 +316,8 @@ static void call_rcu_tasks_generic_timer(struct timer_list *tlp) unsigned long flags; bool needwake = false; struct rcu_tasks *rtp; - struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp, + lazy_timer); rtp = rtpcp->rtpp; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); @@ -552,13 +553,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu rtpcp_next = rtp->rtpcp_array[index]; if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); index++; if (index < num_possible_cpus()) { rtpcp_next = rtp->rtpcp_array[index]; if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? 
rtpcp_next->cpu : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); } } } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c1ebfd51768b..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 659f83e71048..293bbd9ac3f4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -80,6 +80,15 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, }; + +int rcu_get_gpwrap_count(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return READ_ONCE(rdp->gpwrap_count); +} +EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count); + static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, @@ -151,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); @@ -368,7 +376,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs); */ static int rcu_is_cpu_rrupt_from_idle(void) { - long nesting; + long nmi_nesting = ct_nmi_nesting(); /* * Usually called from the tick; but also used from smp_function_call() @@ -380,21 +388,28 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, - "RCU nmi_nesting counter underflow/zero!"); - /* Are we at first interrupt nesting level? */ - nesting = ct_nmi_nesting(); - if (nesting > 1) + /* Non-idle interrupt or nested idle interrupt */ + if (nmi_nesting > 1) return false; /* - * If we're not in an interrupt, we must be in the idle task! + * Non nested idle interrupt (interrupting section where RCU + * wasn't watching). */ - WARN_ON_ONCE(!nesting && !is_idle_task(current)); + if (nmi_nesting == 1) + return true; - /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_nesting() == 0; + /* Not in an interrupt */ + if (!nmi_nesting) { + RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current), + "RCU nmi_nesting counter not in idle task!"); + return !rcu_is_watching_curr_cpu(); + } + + RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!"); + + return false; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 
1000 : 10) @@ -558,7 +573,7 @@ void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. @@ -587,7 +602,7 @@ noinstr void rcu_irq_work_resched(void) if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); @@ -596,7 +611,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** @@ -757,6 +772,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t) smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } +static unsigned long seq_gpwrap_lag = ULONG_MAX / 4; + +/** + * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value. + * @lag_gps: Set overflow lag to this many grace period worth of counters + * which is used by rcutorture to quickly force a gpwrap situation. + * @lag_gps = 0 means we reset it back to the boot-time value. + */ +void rcu_set_gpwrap_lag(unsigned long lag_gps) +{ + unsigned long lag_seq_count; + + lag_seq_count = (lag_gps == 0) + ? 
ULONG_MAX / 4 + : lag_gps << RCU_SEQ_CTR_SHIFT; + WRITE_ONCE(seq_gpwrap_lag, lag_seq_count); +} +EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag); + /* * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow @@ -767,9 +801,11 @@ void rcu_request_urgent_qs_task(struct task_struct *t) static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, - rnp->gp_seq)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag, + rnp->gp_seq)) { WRITE_ONCE(rdp->gpwrap, true); + WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1); + } if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } @@ -801,6 +837,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp) return 0; } +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif + /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since @@ -936,9 +976,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp) rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); - rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); - rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); - rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); rsrp->jiffies = jiffies; rsrp->gp_seq = rdp->gp_seq; } @@ -1060,38 +1100,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) return needmore; } -static void swake_up_one_online_ipi(void *arg) -{ - struct swait_queue_head *wqh = arg; - - swake_up_one(wqh); -} - -static void swake_up_one_online(struct swait_queue_head *wqh) -{ - int cpu = get_cpu(); - - /* - * If called from rcutree_report_cpu_starting(), wake up - * is dangerous that late in the CPU-down hotplug process. The - * scheduler might queue an ignored hrtimer. Defer the wake up - * to an online CPU instead. - */ - if (unlikely(cpu_is_offline(cpu))) { - int target; - - target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), - cpu_online_mask); - - smp_call_function_single(target, swake_up_one_online_ipi, - wqh, 0); - put_cpu(); - } else { - put_cpu(); - swake_up_one(wqh); - } -} - /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately @@ -1116,7 +1124,7 @@ static void rcu_gp_kthread_wake(void) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); - swake_up_one_online(&rcu_state.gp_wq); + swake_up_one(&rcu_state.gp_wq); } /* @@ -1623,8 +1631,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Disabled by default. */ -static int rcu_normal_wake_from_gp; +/* Enable rcu_normal_wake_from_gp automatically on small systems. 
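As an aside on the rcu_set_gpwrap_lag()/rcu_gpnum_ovf() hunks above: the lag is expressed in gp_seq counter units (grace periods shifted left by RCU_SEQ_CTR_SHIFT), and the gpwrap decision is a wrap-safe comparison. The standalone userspace demo below re-creates the macro and the shift value purely for illustration (both are assumptions taken from context, not part of this diff) to show how a small torture-time lag makes a stale per-CPU snapshot trip the check where the boot-time ULONG_MAX/4 default does not.

/*
 * Userspace illustration only -- not kernel code. The macro body and
 * RCU_SEQ_CTR_SHIFT value below are re-created assumptions for the demo.
 */
#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))	/* wrap-safe "a < b" */
#define RCU_SEQ_CTR_SHIFT	2	/* low gp_seq bits hold state, not a count */

int main(void)
{
	unsigned long rnp_gp_seq = 1000UL << RCU_SEQ_CTR_SHIFT;	/* node has moved on */
	unsigned long rdp_gp_seq = 10UL << RCU_SEQ_CTR_SHIFT;	/* stale CPU snapshot */
	unsigned long default_lag = ULONG_MAX / 4;
	unsigned long torture_lag = 8UL << RCU_SEQ_CTR_SHIFT;	/* rcu_set_gpwrap_lag(8) */

	printf("default lag trips gpwrap: %d\n",
	       ULONG_CMP_LT(rdp_gp_seq + default_lag, rnp_gp_seq));	/* prints 0 */
	printf("torture lag trips gpwrap: %d\n",
	       ULONG_CMP_LT(rdp_gp_seq + torture_lag, rnp_gp_seq));	/* prints 1 */
	return 0;
}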
*/ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + +static int rcu_normal_wake_from_gp = -1; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; @@ -1798,6 +1808,7 @@ static noinline_for_stack bool rcu_gp_init(void) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); bool start_new_poll; + unsigned long old_gp_seq; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -1825,7 +1836,24 @@ static noinline_for_stack bool rcu_gp_init(void) */ start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ + old_gp_seq = rcu_state.gp_seq; + /* + * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug + * scan below. Otherwise we risk a race where a newly onlining CPU could + * be missed by the current grace period, potentially leading to + * use-after-free errors. For a detailed explanation of this race, see + * Documentation/RCU/Design/Requirements/Requirements.rst in the + * "Hotplug CPU" section. + * + * Also note that the root rnp's gp_seq is kept separate from, and lags, + * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on + * Single-node systems for more details (in Data-Structures.rst). + */ rcu_seq_start(&rcu_state.gp_seq); + /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq))); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); @@ -1857,6 +1885,10 @@ static noinline_for_stack bool rcu_gp_init(void) /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { local_irq_disable(); + /* + * Serialize with CPU offline. See Requirements.rst > Hotplug CPU > + * Concurrent Quiescent State Reporting for Offline CPUs. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1931,7 +1963,12 @@ static noinline_for_stack bool rcu_gp_init(void) trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - /* Quiescent states for tasks on any now-offline CPUs. */ + /* + * Quiescent states for tasks on any now-offline CPUs. Since we + * released the ofl and rnp lock before this loop, CPUs might + * have gone offline and we have to report QS on their behalf. + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting. + */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) @@ -2659,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2787,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. 
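Several hunks in this file (and in tiny.c above) replace the open-coded pair set_tsk_need_resched(current) + set_preempt_need_resched() with set_need_resched_current(). The helper's definition is not part of this diff; the following is only a sketch of what it is assumed to wrap, inferred from the calls it replaces and from rcu_core() now taking guard(irqsave)() before invoking it.

/*
 * Hedged sketch: assumed shape of set_need_resched_current(), inferred from
 * the two calls it replaces throughout the hunks above. Not the real definition.
 */
#include <linux/sched.h>
#include <linux/preempt.h>
#include <linux/lockdep.h>

static __always_inline void example_set_need_resched_current(void)
{
	lockdep_assert_irqs_disabled();	/* assumption, suggested by the rcu_core() guard */
	set_tsk_need_resched(current);	/* mark the current task for rescheduling */
	set_preempt_need_resched();	/* fold into the per-CPU preempt count */
}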
*/ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2800,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2810,10 +2844,9 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { - local_irq_save(flags); + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); @@ -3064,6 +3097,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. @@ -3231,7 +3268,7 @@ static void synchronize_rcu_normal(void) trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - if (!READ_ONCE(rcu_normal_wake_from_gp)) { + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { wait_rcu_gp(call_rcu_hurry); goto trace_complete_out; } @@ -3759,6 +3796,11 @@ static void rcu_barrier_handler(void *cpu_in) * to complete. For example, if there are no RCU callbacks queued anywhere * in the system, then rcu_barrier() is within its rights to return * immediately, without waiting for anything, much less an RCU grace period. + * In fact, rcu_barrier() will normally not result in any RCU grace periods + * beyond those that were already destined to be executed. + * + * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all + * pending lazy RCU callbacks. */ void rcu_barrier(void) { @@ -3975,7 +4017,7 @@ bool rcu_cpu_online(int cpu) * RCU on an offline processor during initial boot, hence the check for * rcu_scheduler_fully_active. */ -bool rcu_lockdep_current_cpu_online(void) +bool notrace rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; bool ret = false; @@ -4221,6 +4263,8 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcu_preempt_deferred_qs_init(rdp); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); @@ -4256,7 +4300,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -4346,6 +4389,12 @@ void rcutree_report_cpu_dead(void) * may introduce a new READ-side while it is actually off the QS masks. */ lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. 
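The rcu_core() hunk above also converts an explicit local_irq_save()/local_irq_restore() pair into the scope-based guard(irqsave)() from the kernel's cleanup/guard infrastructure, which restores interrupt state automatically when the enclosing scope is left. A small sketch of the same conversion on a made-up helper (struct and function names are illustrative only):

#include <linux/spinlock.h>	/* provides the irqsave guard */
#include <linux/jiffies.h>

/* Hypothetical state, only to show the conversion pattern. */
struct example_stats {
	unsigned long updates;
	unsigned long last_jiffies;
};

static void example_update_stats(struct example_stats *st)
{
	/* Before: local_irq_save(flags); ... local_irq_restore(flags); */
	guard(irqsave)();	/* IRQ state restored automatically at scope exit */

	st->updates++;
	if (!st->updates)	/* early return needs no explicit restore */
		return;
	st->last_jiffies = jiffies;
}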
+ * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4353,6 +4402,13 @@ void rcutree_report_cpu_dead(void) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + + /* + * Hold the ofl_lock and rnp lock to avoid races between CPU going + * offline and doing a QS report (as below), versus rcu_gp_init(). + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section + * for more details. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4363,6 +4419,7 @@ void rcutree_report_cpu_dead(void) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } + /* Clear from ->qsmaskinitnext to mark offline. */ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); @@ -4829,12 +4886,18 @@ void __init rcu_init(void) rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ - rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0); WARN_ON(!rcu_gp_wq); - sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); WARN_ON(!sync_wq); + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a9a811d9d7a3..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -168,12 +168,23 @@ struct rcu_snap_record { u64 cputime_irq; /* Accumulated cputime of hard irqs */ u64 cputime_softirq;/* Accumulated cputime of soft irqs */ u64 cputime_system; /* Accumulated cputime of kernel tasks */ - unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ unsigned int nr_softirqs; /* Accumulated number of soft irqs */ unsigned long long nr_csw; /* Accumulated number of task switches */ unsigned long jiffies; /* Track jiffies value */ }; +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention. + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -183,6 +194,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + unsigned int gpwrap_count; /* Count of GP sequence wrap. */ bool cpu_started; /* RCU watching this onlining CPU. 
*/ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -191,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ @@ -476,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8d4895c854c5..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -200,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) - swake_up_one_online(&rcu_state.expedited_wq); + swake_up_one(&rcu_state.expedited_wq); break; } @@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void) __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU @@ -751,12 +750,8 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, is there no need for a quiescent state from this CPU, - * or is this CPU already looking for a quiescent state for the - * current grace period? If either is the case, just leave. - * However, this should not happen due to the preemptible - * sync_sched_exp_online_cleanup() implementation being a no-op, - * so warn if this does happen. + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. 
*/ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || @@ -803,11 +798,6 @@ static void rcu_exp_handler(void *unused) WARN_ON_ONCE(1); } -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -885,38 +875,6 @@ static void rcu_exp_handler(void *unused) rcu_exp_need_qs(); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - unsigned long flags; - int my_cpu; - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - my_cpu = get_cpu(); - /* Quiescent state either not needed or already requested, leave. */ - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - READ_ONCE(rdp->cpu_no_qs.b.exp)) { - put_cpu(); - return; - } - /* Quiescent state needed on current CPU, so set it up locally. */ - if (my_cpu == cpu) { - local_irq_save(flags); - rcu_exp_need_qs(); - local_irq_restore(flags); - put_cpu(); - return; - } - /* Quiescent state needed on some other CPU, send IPI. */ - ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); - put_cpu(); - WARN_ON_ONCE(ret); -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index fa269d34167a..e6cd56603cad 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -216,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - swake_up_one_online(&rdp_gp->nocb_gp_wq); + swake_up_one(&rdp_gp->nocb_gp_wq); } return needwake; @@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * callback storms, no need to wake up too early. */ if (waketype == RCU_NOCB_WAKE_LAZY && - rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { @@ -554,19 +554,13 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { + } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - /* - * Don't do the wake-up upfront on fragile paths. - * Also offline CPUs can't call swake_up_one_online() from - * (soft-)IRQs. 
Rely on the final deferred wake-up from - * rcutree_report_cpu_dead() - */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); @@ -991,7 +985,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer); WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); @@ -1152,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(cpu_online(rdp->cpu)); /* @@ -1162,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); @@ -1172,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_nocb_rdp_offload_wait_cond(rdp)); @@ -1570,6 +1563,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, &rdp->nocb_entry_rdp, typeof(*rdp), diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3c0bbbbb686f..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,7 +29,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || lockdep_is_held(&rdp->nocb_lock) || lockdep_is_held(&rcu_state.nocb_mutex) || - (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. 
*/ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -626,8 +628,93 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { struct rcu_data *rdp; + lockdep_assert_irqs_disabled(); rdp = container_of(iwp, struct rcu_data, defer_qs_iw); - rdp->defer_qs_iw_pending = false; + + /* + * If the IRQ work handler happens to run in the middle of RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. + */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); +} + +/* + * Check if expedited grace period processing during unlock is needed. + * + * This function determines whether expedited handling is required based on: + * 1. Task blocking an expedited grace period (based on a heuristic, could be + * false-positive, see below.) + * 2. CPU participating in an expedited grace period + * 3. Strict grace period mode requiring expedited handling + * 4. RCU priority deboosting needs when interrupts were disabled + * + * @t: The task being checked + * @rdp: The per-CPU RCU data + * @rnp: The RCU node for this CPU + * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock() + * + * Returns true if expedited processing of the rcu_read_unlock() is needed. + */ +static bool rcu_unlock_needs_exp_handling(struct task_struct *t, + struct rcu_data *rdp, + struct rcu_node *rnp, + bool irqs_were_disabled) +{ + /* + * Check if this task is blocking an expedited grace period. If the + * task was preempted within an RCU read-side critical section and is + * on the expedited grace period blockers list (exp_tasks), we need + * expedited handling to unblock the expedited GP. This is not an exact + * check because 't' might not be on the exp_tasks list at all - its + * just a fast heuristic that can be false-positive sometimes. + */ + if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) + return true; + + /* + * Check if this CPU is participating in an expedited grace period. + * The expmask bitmap tracks which CPUs need to check in for the + * current expedited GP. If our CPU's bit is set, we need expedited + * handling to help complete the expedited GP. + */ + if (rdp->grpmask & READ_ONCE(rnp->expmask)) + return true; + + /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods + * are treated as short for testing purposes even if that means + * disturbing the system more. Check if either: + * - This CPU has not yet reported a quiescent state, or + * - This task was preempted within an RCU critical section + * In either case, require expedited handling for strict GP mode. + */ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) + return true; + + /* + * RCU priority boosting case: If a task is subject to RCU priority + * boosting and exits an RCU read-side critical section with interrupts + * disabled, we need expedited handling to ensure timely deboosting. 
+ * Without this, a low-priority task could incorrectly run at high + * real-time priority for an extended period degrading real-time + * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels, + * not just to PREEMPT_RT. + */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node) + return true; + + return false; } /* @@ -649,18 +736,14 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool expboost; // Expedited GP in flight or possible boosting. + bool needs_exp; // Expedited handling needed. struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || - (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && - t->rcu_blocked_node); + needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled); + // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -670,20 +753,13 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } @@ -736,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. 
*/ return; @@ -822,6 +896,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -1021,6 +1099,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 925fcdad5dea..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -17,8 +17,59 @@ // Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; -int sysctl_max_rcu_stall_to_panic __read_mostly; +static int sysctl_panic_on_rcu_stall __read_mostly; +static int sysctl_max_rcu_stall_to_panic __read_mostly; + +static const struct ctl_table rcu_stall_sysctl_table[] = { + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +}; + +static int __init init_rcu_stall_sysctl(void) +{ + register_sysctl_init("kernel", rcu_stall_sysctl_table); + return 0; +} + +subsys_initcall(init_rcu_stall_sysctl); + +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -112,6 +163,13 @@ static void panic_on_rcu_stall(void) { static int cpu_stall; + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. + */ + if (scx_rcu_cpu_stall()) + return; + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) return; @@ -435,8 +493,8 @@ static void print_cpu_stat_info(int cpu) rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); pr_err("\t hardirqs softirqs csw/system\n"); - pr_err("\t number: %8ld %10d %12lld\n", - kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, nr_context_switches_cpu(cpu) - rsrp->nr_csw); pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", @@ -705,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. 
*/ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } static bool csd_lock_suppress_rcu_stall; @@ -784,6 +841,10 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; +#ifdef CONFIG_SYSFS + ++rcu_stall_count; +#endif + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); @@ -927,8 +988,7 @@ void show_rcu_gp_kthreads(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) - show_rcu_nocb_state(rdp); + show_rcu_nocb_state(rdp); } pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c912b594ba98..dfeba9b35395 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -117,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret) return false; } -int rcu_read_lock_sched_held(void) +int notrace rcu_read_lock_sched_held(void) { bool ret; @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * Note that rcu_read_lock() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_held(void) +int notrace rcu_read_lock_held(void) { bool ret; @@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_bh_held(void) +int notrace rcu_read_lock_bh_held(void) { bool ret; @@ -377,7 +377,7 @@ int rcu_read_lock_bh_held(void) } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -int rcu_read_lock_any_held(void) +int notrace rcu_read_lock_any_held(void) { bool ret; diff --git a/kernel/relay.c b/kernel/relay.c index 5ac7e711e4b6..e36f6b926f7f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -72,17 +72,18 @@ static void relay_free_page_array(struct page **array) } /** - * relay_mmap_buf: - mmap channel buffer to process address space - * @buf: relay channel buffer - * @vma: vm_area_struct describing memory to be mapped + * relay_mmap_prepare_buf: - mmap channel buffer to process address space + * @buf: the relay channel buffer + * @desc: describing what to map * * Returns 0 if ok, negative on error * * Caller should already have grabbed mmap_lock. 
*/ -static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_prepare_buf(struct rchan_buf *buf, + struct vm_area_desc *desc) { - unsigned long length = vma->vm_end - vma->vm_start; + unsigned long length = vma_desc_size(desc); if (!buf) return -EBADF; @@ -90,9 +91,9 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) if (length != (unsigned long)buf->chan->alloc_size) return -EINVAL; - vma->vm_ops = &relay_file_mmap_ops; - vm_flags_set(vma, VM_DONTEXPAND); - vma->vm_private_data = buf; + desc->vm_ops = &relay_file_mmap_ops; + desc->vm_flags |= VM_DONTEXPAND; + desc->private_data = buf; return 0; } @@ -118,7 +119,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +128,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; @@ -250,13 +250,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -298,11 +303,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -452,7 +459,7 @@ int relay_prepare_cpu(unsigned int cpu) /** * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only + * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers @@ -465,10 +472,6 @@ int relay_prepare_cpu(unsigned int cpu) * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. - * - * If opening a buffer (@parent = NULL) that you later wish to register - * in a filesystem, call relay_late_setup_files() once the @parent dentry - * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -540,111 +543,6 @@ struct rchan_percpu_buf_dispatcher { struct dentry *dentry; }; -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel created - * by relay_open() with a NULL parent dentry. 
- * - * For example, this is useful for perfomring early tracing in kernel, - * before VFS is up and then exposing the early results once the dentry - * is available. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_buf *buf; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strscpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - - if (chan->is_global) { - err = -EINVAL; - buf = *per_cpu_ptr(chan->buf, 0); - if (!WARN_ON_ONCE(!buf)) { - dentry = relay_create_buf_file(chan, buf, 0); - if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(buf, dentry); - err = 0; - } - } - mutex_unlock(&relay_channels_mutex); - return err; - } - - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - buf = *per_cpu_ptr(chan->buf, i); - if (unlikely(!buf)) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, buf, i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(buf, dentry); - local_irq_restore(flags); - } else { - disp.buf = buf; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. 
*/ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(relay_late_setup_files); - /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer @@ -664,9 +562,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -691,7 +591,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } @@ -704,7 +604,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -764,11 +664,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -803,6 +698,42 @@ void relay_flush(struct rchan *chan) EXPORT_SYMBOL_GPL(relay_flush); /** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. + */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } + } + } + + return count; +} + +/** * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file @@ -819,16 +750,16 @@ static int relay_file_open(struct inode *inode, struct file *filp) } /** - * relay_file_mmap - mmap file op for relay files - * @filp: the file - * @vma: the vma describing what to map + * relay_file_mmap_prepare - mmap file op for relay files + * @desc: describing what to map * - * Calls upon relay_mmap_buf() to map the file into user space. + * Calls upon relay_mmap_prepare_buf() to map the file into user space. 
*/ -static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap_prepare(struct vm_area_desc *desc) { - struct rchan_buf *buf = filp->private_data; - return relay_mmap_buf(buf, vma); + struct rchan_buf *buf = desc->file->private_data; + + return relay_mmap_prepare_buf(buf, desc); } /** @@ -1076,7 +1007,7 @@ static ssize_t relay_file_read(struct file *filp, const struct file_operations relay_file_operations = { .open = relay_file_open, .poll = relay_file_poll, - .mmap = relay_file_mmap, + .mmap_prepare = relay_file_mmap_prepare, .read = relay_file_read, .release = relay_file_release, }; diff --git a/kernel/resource.c b/kernel/resource.c index 8d3e6ed0bdc1..e4e9bac12e6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, struct resource *res) { + /* Skip children until we find a top level range that matches */ + bool skip_children = true; struct resource *p; if (!res) @@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, false) { + for_each_resource(&iomem_resource, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; + /* + * We found a top level range that matches what we are looking + * for. Time to start checking children too. + */ + skip_children = false; + /* Found a match, break */ if (is_type_match(p, flags, desc)) break; @@ -1279,8 +1287,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent * become unavailable to other users. Conflicts are * not expected. Warn to aid debugging if encountered. */ - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_warn("Unaddressable device %s %pR conflicts with %pR", + if (parent == &iomem_resource && + conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR\n", conflict->name, conflict, res); } if (conflict != parent) { @@ -1387,6 +1396,47 @@ void __release_region(struct resource *parent, resource_size_t start, EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE +static void append_child_to_parent(struct resource *new_parent, struct resource *new_child) +{ + struct resource *child; + + child = new_parent->child; + if (child) { + while (child->sibling) + child = child->sibling; + child->sibling = new_child; + } else { + new_parent->child = new_child; + } + new_child->parent = new_parent; + new_child->sibling = NULL; +} + +/* + * Reparent all child resources that no longer belong to "low" after a split to + * "high". Note that "high" does not have any children, because "low" is the + * original resource and "high" is a new resource. Treat "low" as the original + * resource being split and defer its range adjustment to __adjust_resource(). 
+ */ +static void reparent_children_after_split(struct resource *low, + struct resource *high, + resource_size_t split_addr) +{ + struct resource *child, *next, **p; + + p = &low->child; + while ((child = *p)) { + next = child->sibling; + if (child->start > split_addr) { + /* unlink child */ + *p = next; + append_child_to_parent(high, child); + } else { + p = &child->sibling; + } + } +} + /** * release_mem_region_adjustable - release a previously reserved memory region * @start: resource start address @@ -1396,15 +1446,13 @@ EXPORT_SYMBOL(__release_region); * is released from a currently busy memory resource. The requested region * must either match exactly or fit into a single busy resource entry. In * the latter case, the remaining resource is adjusted accordingly. - * Existing children of the busy memory resource must be immutable in the - * request. * * Note: * - Additional release conditions, such as overlapping region, can be * supported after they are confirmed as valid cases. - * - When a busy memory resource gets split into two entries, the code - * assumes that all children remain in the lower address entry for - * simplicity. Enhance this logic when necessary. + * - When a busy memory resource gets split into two entries, its children are + * reassigned to the correct parent based on their range. If a child memory + * resource overlaps with more than one parent, enhance the logic as needed. */ void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { @@ -1481,6 +1529,7 @@ retry: new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; + reparent_children_after_split(res, new_res, end); if (WARN_ON_ONCE(__adjust_resource(res, res->start, start - res->start))) diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e81..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,98 +8,7 @@ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/rseq.h> -#include <linux/types.h> -#include <linux/ratelimit.h> -#include <asm/ptrace.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/rseq.h> - -/* The original rseq structure size (including padding) is 32 bytes. */ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) -{ - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; - - /* - * Validate fields which are required to be read-only by - * user-space. 
- */ - if (!user_read_access_begin(rseq, t->rseq_len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) -#endif - /* - * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing @@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t) * F1. <failure> */ -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; +#include <linux/debugfs.h> +#include <linux/ratelimit.h> +#include <linux/rseq_entry.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <asm/ptrace.h> - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. 
- */ - user_write_access_end(); - trace_rseq_update(t); - return 0; +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); -efault_end: - user_write_access_end(); -efault: - return -EFAULT; +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); } -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static int __init rseq_setup_debug(char *str) { - struct rseq __user *rseq = t->rseq; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; + bool on; -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; } +__setup("rseq_debug=", rseq_setup_debug); +#ifdef CONFIG_TRACEPOINTS /* - * Get the user-space pointer value stored in the 'rseq_cs' field. + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +void __rseq_trace_update(struct task_struct *t) { - if (!rseq_cs) - return -EFAULT; - -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif + trace_rseq_update(t); +} - return 0; +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); } +#endif /* CONFIG_TRACEPOINTS */ -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_stats_show(struct seq_file *m, void *p) { - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. 
*/ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} + +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; - if (current->rseq_sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); - return -EINVAL; - } +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); return 0; } +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ -static bool rseq_warn_flags(const char *str, u32 flags) +static int rseq_debug_show(struct seq_file *m, void *p) { - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) { - u32 flags, event_mask; - int ret; + bool on; - if (rseq_warn_flags("rseq_cs", cs_flags)) + if (kstrtobool_from_user(ubuf, count, &on)) return -EINVAL; - /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); - if (ret) - return ret; + rseq_control_debug(on); + return count; +} - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} - /* - * Load and clear event mask atomically with respect to - * scheduler preemption. - */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); +static const struct file_operations debug_ops = { + .open = rseq_debug_open, + .read = seq_read, + .write = rseq_debug_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - return !!event_mask; + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); + return 0; } +__initcall(rseq_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ -static int clear_rseq_cs(struct rseq __user *rseq) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. 
- */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } -static int rseq_ip_fixup(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; + struct rseq_ids ids; + u32 node_id; + bool event; + + if (unlikely(t->flags & PF_EXITING)) + return; - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; + rseq_stat_inc(rseq_stats.slowpath); /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) - return ret; - ret = clear_rseq_cs(t->rseq); - if (ret) - return ret; - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + scoped_guard(irq) { + event = t->rseq.event.sched_switch; + t->rseq.event.all &= evt_mask.all; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); + } + + if (!event) + return; + + node_id = cpu_to_node(ids.cpu_id); + + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. 
- */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { - struct task_struct *t = current; - int ret, sig; - - if (unlikely(t->flags & PF_EXITING)) + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) return; + rseq_slowpath_update_usr(regs); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. */ - if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. + */ + current->rseq.event.error = 0; + force_sigsegv(sig); } - if (unlikely(rseq_update_cpu_node_id(t))) - goto error; - return; - -error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ - /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; - struct rseq_cs rseq_cs; + u64 csaddr; - if (!t->rseq) + if (!t->rseq.event.has_rseq) return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); } +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - u64 rseq_cs; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. 
*/ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. - */ - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) - return -EFAULT; - if (rseq_cs && clear_rseq_cs(rseq)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. 
*/ - rseq_set_notify_resume(current); - + current->rseq.event.has_rseq = true; + rseq_force_update(); return 0; + +efault: + return -EFAULT; } diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e7..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; @@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); @@ -175,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). 
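To illustrate the register/unregister semantics enforced by the sys_rseq() changes in kernel/rseq.c above, here is a minimal userspace sketch using the raw syscall. It is not part of this diff: the signature value is arbitrary, and it assumes the C library has not already registered an rseq area for the thread (recent glibc does so by default; running with GLIBC_TUNABLES=glibc.pthread.rseq=0 is one way to exercise the raw path).

#define _GNU_SOURCE
#include <linux/rseq.h>        /* struct rseq, RSEQ_FLAG_UNREGISTER (UAPI header) */
#include <sys/syscall.h>       /* __NR_rseq */
#include <stdio.h>
#include <unistd.h>

#define EXAMPLE_RSEQ_SIG 0x53053053u   /* arbitrary abort signature, for this sketch only */

/* struct rseq carries a 32-byte alignment attribute in the UAPI header. */
static __thread struct rseq rseq_area;

int main(void)
{
	/*
	 * Register: on success the kernel clears rseq_cs and publishes the
	 * current cpu_id before returning to user space.
	 */
	if (syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, EXAMPLE_RSEQ_SIG)) {
		perror("rseq register");
		return 1;
	}
	printf("rseq registered, running on cpu %u\n", rseq_area.cpu_id);

	/*
	 * Unregister: address, length and signature must match the values
	 * used at registration time, mirroring the checks in sys_rseq().
	 */
	if (syscall(__NR_rseq, &rseq_area, sizeof(rseq_area),
		    RSEQ_FLAG_UNREGISTER, EXAMPLE_RSEQ_SIG)) {
		perror("rseq unregister");
		return 1;
	}
	return 0;
}

Registering a second time with the same address, length and signature returns -EBUSY, and a mismatched signature on unregister returns -EPERM, matching the error paths in the sys_rseq() hunk above.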
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eaf..06c82b2bdfb5 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b726..755883faf751 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,16 +50,15 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif +#include "pelt.c" #include "cputime.c" #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include "ext_internal.h" # include "ext.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b70..e2cf3b08d4e9 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,11 +80,10 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -# include "topology.c" -#endif +#include "cpupri.c" +#include "stop_task.c" + +#include "topology.c" #ifdef CONFIG_SCHED_CORE # include "core_sched.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b48140..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include <linux/sched/clock.h> +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4..19ee702273c0 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. 
*/ +#include <linux/linkage.h> +#include <linux/sched/debug.h> +#include <linux/completion.h> +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c81cf642dba0..41ba0be16911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include <linux/sched.h> #include <linux/highmem.h> #include <linux/hrtimer_api.h> #include <linux/ktime_api.h> @@ -66,10 +68,11 @@ #include <linux/vtime.h> #include <linux/wait_api.h> #include <linux/workqueue_api.h> +#include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC -# ifdef CONFIG_GENERIC_ENTRY -# include <linux/entry-common.h> +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> # endif #endif @@ -95,6 +98,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include "../locking/mutex.h" EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -117,6 +121,36 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); + +#ifdef CONFIG_SCHED_PROXY_EXEC +DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); +static int __init setup_proxy_exec(char *str) +{ + bool proxy_enable = true; + + if (*str && kstrtobool(str + 1, &proxy_enable)) { + pr_warn("Unable to parse sched_proxy_exec=\n"); + return 0; + } + + if (proxy_enable) { + pr_info("sched_proxy_exec enabled via boot arg\n"); + static_branch_enable(&__sched_proxy_exec); + } else { + pr_info("sched_proxy_exec disabled via boot arg\n"); + static_branch_disable(&__sched_proxy_exec); + } + return 1; +} +#else +static int __init setup_proxy_exec(char *str) +{ + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); + return 0; +} +#endif +__setup("sched_proxy_exec", setup_proxy_exec); /* * Debugging: various feature bits @@ -480,13 +514,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -550,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). 
* @@ -649,7 +683,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -666,7 +699,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -846,14 +878,12 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->donor->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->donor, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -890,7 +920,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); + rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -898,33 +928,12 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -932,7 +941,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -948,7 +957,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -987,13 +996,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1109,6 +1116,7 @@ static void __resched_curr(struct rq *rq, int tif) cpu = cpu_of(rq); + trace_sched_set_need_resched_tp(curr, cpu, tif); if (cpu == smp_processor_id()) { set_ti_thread_flag(cti, tif); if (tif == TIF_NEED_RESCHED) @@ -1124,6 +1132,11 @@ static void __resched_curr(struct rq *rq, int tif) } } +void __trace_set_need_resched(struct task_struct *curr, int tif) +{ + trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); +} + void resched_curr(struct rq *rq) { __resched_curr(rq, TIF_NEED_RESCHED); @@ -1166,7 +1179,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ 
-1373,10 +1385,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. @@ -1752,7 +1762,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, } } -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { enum uclamp_id clamp_id; @@ -1768,7 +1778,8 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; - if (p->se.sched_delayed) + /* Only inc the delayed task which being woken up. */ + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) return; for_each_clamp_id(clamp_id) @@ -1969,7 +1980,7 @@ undo: sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2035,13 +2046,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +#else /* !CONFIG_UCLAMP_TASK: */ +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -2072,12 +2083,15 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - p->sched_class->enqueue_task(rq, p, flags); /* - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear - * ->sched_delayed. + * Can be before ->enqueue_task() because uclamp considers the + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared + * in ->enqueue_task(). */ - uclamp_rq_inc(rq, p); + uclamp_rq_inc(rq, p, flags); + + rq->queue_mask |= p->sched_class->queue_mask; + p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2109,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2116,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -2157,37 +2170,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. 
- */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2283,6 +2265,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat. */ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); @@ -2343,10 +2331,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2361,34 +2347,11 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2396,35 +2359,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. 
- */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2634,7 +2581,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2643,7 +2591,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2692,6 +2641,8 @@ out_unlock: return 0; } +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2705,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2714,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->sched_class->set_cpus_allowed(p, ctx); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. 
*/ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2775,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2813,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -2926,8 +2840,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + /* + * Can the task run on the task's current CPU? If so, we're done + * + * We are also done if the task is the current donor, boosting a lock- + * holding proxy, (and potentially has been migrated outside its + * current or previous affinity mask) + */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || + (task_current_donor(rq, p) && !task_current(rq, p))) { struct task_struct *push_task = NULL; if ((flags & SCA_MIGRATE_ENABLE) && @@ -3078,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, unsigned int dest_cpu; int ret = 0; - update_rq_clock(rq); - if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, @@ -3134,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3295,6 +3214,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3341,13 +3262,12 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) @@ -3540,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. 
- */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3647,17 +3561,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3668,7 +3571,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3688,7 +3590,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3717,13 +3618,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3734,7 +3633,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3756,7 +3654,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif } /* @@ -3805,12 +3702,11 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3877,7 +3773,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3929,6 +3827,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (!scx_allow_ttwu_queue(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -3978,15 +3881,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4242,7 +4136,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4263,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). 
* * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4321,9 +4214,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else - cpu = task_cpu(p); -#endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4356,14 +4246,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. */ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } @@ -4407,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -4474,7 +4362,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4492,6 +4380,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS @@ -4519,11 +4410,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif - init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4585,8 +4473,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4711,7 +4599,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* @@ -4773,14 +4661,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4802,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4857,7 +4741,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4867,9 +4750,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. 
*/ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4877,7 +4758,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4887,7 +4767,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } @@ -4964,7 +4843,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4976,11 +4855,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -4989,12 +4867,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5007,11 +4883,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5093,14 +4966,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5178,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5279,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. */ put_task_stack(prev); @@ -5310,7 +5182,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * switched the context for the first time. It is returning from * schedule for the first time in this path. 
*/ - trace_sched_exit_tp(true, CALLER_ADDR0); + trace_sched_exit_tp(true); preempt_enable(); if (current->set_child_tid) @@ -5341,19 +5213,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5366,15 +5235,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + mm_cid_switch_to(prev, next); + + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); prepare_lock_switch(rq, next, rf); @@ -5488,8 +5362,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5513,8 +5385,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5549,7 +5419,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5663,7 +5533,6 @@ void sched_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5676,12 +5545,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -5755,7 +5622,7 @@ static void sched_tick_remote(struct work_struct *work) * reasonable amount of time. 
*/ u64 delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); } curr->sched_class->task_tick(rq, curr, 0); @@ -5821,10 +5688,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -5979,19 +5846,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -6035,7 +5889,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -6047,11 +5901,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6081,7 +5939,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6089,7 +5951,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6104,7 +5966,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6189,7 +6051,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. 
*/ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6209,6 +6074,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6220,7 +6087,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6242,7 +6113,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6539,7 +6410,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6551,7 +6422,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). @@ -6567,19 +6438,33 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Helper function for __schedule() * - * If a task does not have signals pending, deactivate it - * Otherwise marks the task's __state as RUNNING + * Tries to deactivate the task, unless the should_block arg + * is false or if a signal is pending. In the case a signal + * is pending, marks the task's __state as RUNNING (and clear + * blocked_on). */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long task_state) + unsigned long *task_state_p, bool should_block) { + unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; return false; } + /* + * We check should_block after signal_pending because we + * will want to wake the task in that case. But if + * should_block is false, its likely due to the task being + * blocked on a mutex, and we want to keep it on the runqueue + * to be selectable for proxy-execution. 
+ */ + if (!should_block) + return false; + p->sched_contributes_to_load = (task_state & TASK_UNINTERRUPTIBLE) && !(task_state & TASK_NOLOAD) && @@ -6603,6 +6488,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return true; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline struct task_struct *proxy_resched_idle(struct rq *rq) +{ + put_prev_set_next_task(rq, rq->donor, rq->idle); + rq_set_donor(rq, rq->idle); + set_tsk_need_resched(rq->idle); + return rq->idle; +} + +static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + unsigned long state = READ_ONCE(donor->__state); + + /* Don't deactivate if the state has been changed to TASK_RUNNING */ + if (state == TASK_RUNNING) + return false; + /* + * Because we got donor from pick_next_task(), it is *crucial* + * that we call proxy_resched_idle() before we deactivate it. + * As once we deactivate donor, donor->on_rq is set to zero, + * which allows ttwu() to immediately try to wake the task on + * another rq. So we cannot use *any* references to donor + * after that point. So things like cfs_rq->curr or rq->donor + * need to be changed from next *before* we deactivate. + */ + proxy_resched_idle(rq); + return try_to_block_task(rq, donor, &state, true); +} + +static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + if (!__proxy_deactivate(rq, donor)) { + /* + * XXX: For now, if deactivation failed, set donor + * as unblocked, as we aren't doing proxy-migrations + * yet (more logic will be needed then). + */ + donor->blocked_on = NULL; + } + return NULL; +} + +/* + * Find runnable lock owner to proxy for mutex blocked donor + * + * Follow the blocked-on relation: + * task->blocked_on -> mutex->owner -> task... + * + * Lock order: + * + * p->pi_lock + * rq->lock + * mutex->wait_lock + * + * Returns the task that is going to be used as execution context (the one + * that is actually going to be run on cpu_of(rq)). + */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + struct task_struct *owner = NULL; + int this_cpu = cpu_of(rq); + struct task_struct *p; + struct mutex *mutex; + + /* Follow blocked_on chain. */ + for (p = donor; task_is_blocked(p); p = owner) { + mutex = p->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that p is blocked with wait_lock held */ + if (mutex != __get_task_blocked_on(p)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; + } + + owner = __mutex_owner(mutex); + if (!owner) { + __clear_task_blocked_on(p, mutex); + return p; + } + + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { + /* XXX Don't handle blocked owners/delayed dequeue yet */ + return proxy_deactivate(rq, donor); + } + + if (task_cpu(owner) != this_cpu) { + /* XXX Don't handle migrations yet */ + return proxy_deactivate(rq, donor); + } + + if (task_on_rq_migrating(owner)) { + /* + * One of the chain of mutex owners is currently migrating to this + * CPU, but has not yet been enqueued because we are holding the + * rq lock. As a simple solution, just schedule rq->idle to give + * the migration a chance to complete. 
Much like the migrate_task + * case we should end up back in find_proxy_task(), this time + * hopefully with all relevant tasks already enqueued. + */ + return proxy_resched_idle(rq); + } + + /* + * Its possible to race where after we check owner->on_rq + * but before we check (owner_cpu != this_cpu) that the + * task on another cpu was migrated back to this cpu. In + * that case it could slip by our checks. So double check + * we are still on this cpu and not migrating. If we get + * inconsistent results, try again. + */ + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) + return NULL; + + if (owner == p) { + /* + * It's possible we interleave with mutex_unlock like: + * + * lock(&rq->lock); + * find_proxy_task() + * mutex_unlock() + * lock(&wait_lock); + * donor(owner) = current->blocked_donor; + * unlock(&wait_lock); + * + * wake_up_q(); + * ... + * ttwu_runnable() + * __task_rq_lock() + * lock(&wait_lock); + * owner == p + * + * Which leaves us to finish the ttwu_runnable() and make it go. + * + * So schedule rq->idle so that ttwu_runnable() can get the rq + * lock and mark owner as running. + */ + return proxy_resched_idle(rq); + } + /* + * OK, now we're absolutely sure @owner is on this + * rq, therefore holding @rq->lock is sufficient to + * guarantee its existence, as per ttwu_remote(). + */ + } + + WARN_ON_ONCE(owner && !owner->on_rq); + return owner; +} +#else /* SCHED_PROXY_EXEC */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); + return donor; +} +#endif /* SCHED_PROXY_EXEC */ + +static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) +{ + if (!sched_proxy_exec()) + return; + /* + * pick_next_task() calls set_next_task() on the chosen task + * at some point, which ensures it is not push/pullable. + * However, the chosen/donor task *and* the mutex owner form an + * atomic pair wrt push/pull. + * + * Make sure owner we run is not pushable. Unfortunately we can + * only deal with that by means of a dequeue/enqueue cycle. :-/ + */ + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); +} + /* * __schedule() is the main scheduler function. * @@ -6657,7 +6730,8 @@ static void __sched notrace __schedule(int sched_mode) struct rq *rq; int cpu; - trace_sched_entry_tp(preempt, CALLER_ADDR0); + /* Trace preemptions consistently with task switches */ + trace_sched_entry_tp(sched_mode == SM_PREEMPT); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -6668,8 +6742,11 @@ static void __sched notrace __schedule(int sched_mode) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); + klp_sched_try_switch(prev); + local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6713,15 +6790,31 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, prev_state); + /* + * We pass task_is_blocked() as the should_block arg + * in order to keep mutex-blocked tasks on the runqueue + * for slection with proxy-exec (without proxy-exec + * task_is_blocked() will always be false). 
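find_proxy_task() above follows the blocked-on relation (task->blocked_on -> mutex->owner -> task ...) until it reaches a runnable owner to use as the execution context for the donor. A simplified userspace model of that walk, leaving out the wait_lock, migration, deactivation and self-owner cases the real code has to handle; all names here are illustrative:

#include <stdio.h>
#include <stddef.h>

struct task;

struct mutex { struct task *owner; };

struct task {
	const char	*name;
	struct mutex	*blocked_on;	/* NULL if not blocked */
};

/* Follow donor -> mutex -> owner until someone is not blocked. */
static struct task *model_find_proxy(struct task *donor)
{
	struct task *p = donor;

	while (p->blocked_on) {
		struct task *owner = p->blocked_on->owner;

		/* The real code also guards against owner == p and chain changes. */
		if (!owner)		/* lock was just released: run the waiter */
			return p;
		p = owner;
	}
	return p;			/* runnable owner: the execution context */
}

int main(void)
{
	struct task c = { "C", NULL };
	struct mutex m2 = { &c };
	struct task b = { "B", &m2 };
	struct mutex m1 = { &b };
	struct task a = { "A", &m1 };	/* donor, blocked behind B and C */

	printf("run %s on behalf of %s\n", model_find_proxy(&a)->name, a.name);
	return 0;
}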
+ */ + try_to_block_task(rq, prev, &prev_state, + !task_is_blocked(prev)); switch_count = &prev->nvcsw; } - next = pick_next_task(rq, prev, &rf); +pick_again: + next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + if (unlikely(task_is_blocked(next))) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + if (next == rq->idle) + goto keep_resched; + } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +keep_resched: rq->last_seen_need_resched_ns = 0; is_switch = prev != next; @@ -6732,6 +6825,10 @@ picked: * changes to task_struct made by pick_next_task(). */ RCU_INIT_POINTER(rq->curr, next); + + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6756,7 +6853,6 @@ picked: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -6766,11 +6862,15 @@ picked: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + /* In case next was already curr but just got blocked_donor */ + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } - trace_sched_exit_tp(is_switch, CALLER_ADDR0); + trace_sched_exit_tp(is_switch); } void __noreturn do_task_dead(void) @@ -6974,14 +7074,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -6991,8 +7091,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7047,14 +7147,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static 
DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7064,7 +7164,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7160,7 +7260,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7222,68 +7322,52 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); - - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. 
-dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: - /* Avoid rq from going away on us: */ - preempt_disable(); + /* Caller holds task_struct::pi_lock, IRQs are still disabled */ rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); - - preempt_enable(); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7314,21 +7398,20 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { - klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -7343,8 +7426,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7410,9 +7493,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include <linux/entry-common.h> -#endif +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> +# endif /* * SC:cond_resched @@ -7467,40 +7550,39 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; 
if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); -static bool klp_override; static void __sched_dynamic_update(int mode) { @@ -7508,8 +7590,7 @@ static void __sched_dynamic_update(int mode) * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. 
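With the klp_override special case removed, __sched_dynamic_update() above always enables every dynamic call first and only then disables what the selected mode does not use, so no transition ever passes through an all-disabled state. A small model of that update order; the three-entry call set and the mode table are illustrative only, not the kernel's actual tables:

#include <stdbool.h>
#include <stdio.h>

enum { CALL_COND_RESCHED, CALL_MIGHT_RESCHED, CALL_PREEMPT, NR_CALLS };

static bool enabled[NR_CALLS];

/* Which calls each (illustrative) mode wants left enabled. */
static const bool mode_wants[][NR_CALLS] = {
	[0] = { true,  false, false },	/* "none"-like      */
	[1] = { true,  true,  false },	/* "voluntary"-like */
	[2] = { false, false, true  },	/* "full"-like      */
};

static void update_mode(int mode)
{
	int i;

	/* Enable everything first: no transient all-off state ... */
	for (i = 0; i < NR_CALLS; i++)
		enabled[i] = true;
	/* ... then switch off what this mode does not use. */
	for (i = 0; i < NR_CALLS; i++)
		if (!mode_wants[mode][i])
			enabled[i] = false;
}

int main(void)
{
	update_mode(2);
	printf("cond_resched=%d might_resched=%d preempt=%d\n",
	       enabled[CALL_COND_RESCHED], enabled[CALL_MIGHT_RESCHED],
	       enabled[CALL_PREEMPT]);
	return 0;
}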
*/ - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7518,8 +7599,7 @@ static void __sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7530,8 +7610,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_voluntary: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7542,8 +7621,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_full: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7554,8 +7632,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_lazy: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7576,36 +7653,6 @@ void sched_dynamic_update(int mode) mutex_unlock(&sched_dynamic_mutex); } -#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - -static int klp_cond_resched(void) -{ - __klp_sched_try_switch(); - return __cond_resched(); -} - -void sched_dynamic_klp_enable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = true; - static_call_update(cond_resched, klp_cond_resched); - - mutex_unlock(&sched_dynamic_mutex); -} - -void sched_dynamic_klp_disable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = false; - __sched_dynamic_update(preempt_dynamic_mode); - - mutex_unlock(&sched_dynamic_mutex); -} - -#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); @@ -7637,7 +7684,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -7682,7 +7729,7 @@ const char *preempt_model_str(void) if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { seq_buf_printf(&s, "(%s)%s", - preempt_dynamic_mode > 0 ? + preempt_dynamic_mode >= 0 ? preempt_modes[preempt_dynamic_mode] : "undef", brace ? "}" : ""); return seq_buf_str(&s); @@ -7838,12 +7885,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7859,13 +7904,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. 
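The preempt_model_str() hunk above relaxes the guard from preempt_dynamic_mode > 0 to >= 0. Assuming, as the check implies, that the first real mode has value 0 and the undefined sentinel is negative, the old comparison reported that first mode as "undef". A tiny illustration with an assumed enum layout and made-up names:

#include <stdio.h>

/* Assumed layout: undefined sentinel negative, first real mode == 0. */
enum { MODE_UNDEFINED = -1, MODE_NONE, MODE_VOLUNTARY, MODE_FULL };

static const char * const mode_names[] = { "none", "voluntary", "full" };

static const char *name_old(int mode)
{
	return mode > 0 ? mode_names[mode] : "undef";	/* wrong for MODE_NONE */
}

static const char *name_new(int mode)
{
	return mode >= 0 ? mode_names[mode] : "undef";
}

int main(void)
{
	printf("old: %s, new: %s\n", name_old(MODE_NONE), name_new(MODE_NONE));
	return 0;
}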
*/ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7884,9 +7927,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7899,13 +7940,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -7965,26 +8002,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -8022,18 +8042,15 @@ static int __balance_push_cpu_stop(void *arg) struct rq_flags rf; int cpu; - raw_spin_lock_irq(&p->pi_lock); - rq_lock(rq, &rf); - - update_rq_clock(rq); - - if (task_rq(p) == rq && task_on_rq_queued(p)) { + scoped_guard (raw_spinlock_irq, &p->pi_lock) { cpu = select_fallback_rq(rq->cpu, p); - rq = __migrate_task(rq, &rf, p, cpu); - } - rq_unlock(rq, &rf); - raw_spin_unlock_irq(&p->pi_lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, cpu); + rq_unlock(rq, &rf); + } put_task_struct(p); @@ -8141,7 +8158,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8155,7 +8172,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8452,10 +8469,12 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { WARN(true, "Dying CPU not properly vacated!"); dump_rq_tasks(rq, KERN_WARNING); } + dl_server_stop(&rq->fair_server); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -8464,12 +8483,14 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot @@ -8488,6 +8509,8 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_init_dl_servers(); + sched_smp_initialized = true; } @@ -8498,13 +8521,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else -void __init sched_init_smp(void) -{ - 
sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8530,9 +8546,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8563,7 +8577,7 @@ void __init sched_init(void) init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - root_task_group.scx_weight = CGROUP_WEIGHT_DFL; + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -8575,9 +8589,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8638,7 +8650,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8664,7 +8675,6 @@ void __init sched_init(void) #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); #endif -#endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8712,10 +8722,9 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); -#endif init_sched_fair_class(); init_sched_ext_class(); @@ -8848,7 +8857,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8879,8 +8888,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8920,7 +8929,7 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. 
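rt_mutex_setprio() and sched_setnuma() above (and sched_move_task() further down) replace the open-coded dequeue_task()/put_prev_task() ... enqueue_task()/set_next_task() sequence with a scoped_guard(sched_change, ...) block that restores queueing state on scope exit. A userspace sketch of that save/change/restore shape using the compiler cleanup attribute, much like the kernel's guard infrastructure; every name in it is made up and the "running" half of the real pattern is omitted:

#include <stdbool.h>
#include <stdio.h>

struct task { bool queued; int prio; };

struct change_guard { struct task *t; bool was_queued; };

/* "Scope exit": put the task back the way the guard found it. */
static void change_end(struct change_guard *g)
{
	if (g->was_queued)
		g->t->queued = true;	/* re-enqueue with the new attributes */
}

static struct change_guard change_begin(struct task *t)
{
	struct change_guard g = { .t = t, .was_queued = t->queued };

	if (t->queued)
		t->queued = false;	/* dequeue before changing attributes */
	return g;
}

int main(void)
{
	struct task t = { .queued = true, .prio = 120 };

	{
		struct change_guard g __attribute__((cleanup(change_end))) =
			change_begin(&t);

		t.prio = 100;		/* change properties while off the queue */
		(void)g;
	}
	printf("queued=%d prio=%d\n", t.queued, t.prio);
	return 0;
}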
* @@ -8944,7 +8953,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9003,7 +9012,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; - scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + scx_tg_init(tg); alloc_uclamp_sched_group(tg, parent); return tg; @@ -9018,7 +9027,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); + list_add_tail_rcu(&tg->list, &task_groups); /* Root should already exist: */ WARN_ON(!parent); @@ -9098,38 +9107,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -9204,11 +9198,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *task; struct cgroup_subsys_state *css; + if (!rt_group_sched_enabled()) + goto scx_check; + cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } -#endif +scx_check: +#endif /* CONFIG_RT_GROUP_SCHED */ return scx_cgroup_can_attach(tset); } @@ -9219,8 +9217,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, false); - - scx_cgroup_finish_attach(); } static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) @@ -9408,7 +9404,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } @@ -9436,47 +9432,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ -/* More than 203 days if BW_SHIFT equals 20. 
*/ -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; - static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, - u64 burst) +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 period, quota, burst; - if (tg == &root_task_group) - return -EINVAL; - - /* - * Ensure we have at some amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. - */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; - - /* - * Likewise, bound things on the other side by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. - */ - if (period > max_cfs_quota_period) - return -EINVAL; + period = (u64)period_us * NSEC_PER_USEC; - /* - * Bound quota to defend quota against overflow during bandwidth shift. - */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9519,7 +9491,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); @@ -9531,28 +9503,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static u64 tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } -static long tg_get_cfs_quota(struct task_group *tg) +static u64 tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; + return RUNTIME_INF; quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); @@ -9560,45 +9526,7 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_period(struct task_group *tg) -{ - u64 cfs_period_us; - - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); - - return cfs_period_us; -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) 
-{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_burst(struct task_group *tg) +static u64 tg_get_cfs_burst(struct task_group *tg) { u64 burst_us; @@ -9608,42 +9536,6 @@ static long tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9778,6 +9670,143 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) } #endif /* CONFIG_CFS_BANDWIDTH */ +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + +static void tg_bandwidth(struct task_group *tg, + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) +{ +#ifdef CONFIG_CFS_BANDWIDTH + if (period_us_p) + *period_us_p = tg_get_cfs_period(tg); + if (quota_us_p) + *quota_us_p = tg_get_cfs_quota(tg); + if (burst_us_p) + *burst_us_p = tg_get_cfs_burst(tg); +#else /* !CONFIG_CFS_BANDWIDTH */ + if (period_us_p) + *period_us_p = tg->scx.bw_period_us; + if (quota_us_p) + *quota_us_p = tg->scx.bw_quota_us; + if (burst_us_p) + *burst_us_p = tg->scx.bw_burst_us; +#endif /* CONFIG_CFS_BANDWIDTH */ +} + +static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 period_us; + + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); + return period_us; +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + int ret = 0; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. 
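The reworked tg_set_cfs_bandwidth()/tg_set_bandwidth() above take period, quota and burst in microseconds and scale them to nanoseconds internally, passing RUNTIME_INF through unscaled, while the getters divide the stored values back down. A standalone sketch of that round trip; it assumes RUNTIME_INF is the all-ones 64-bit value, which is an assumption of this sketch rather than something stated in the hunks:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	((uint64_t)~0ULL)	/* assumed representation */

/* Microseconds in, nanoseconds stored; RUNTIME_INF is never scaled. */
static uint64_t quota_us_to_ns(uint64_t quota_us)
{
	if (quota_us == RUNTIME_INF)
		return RUNTIME_INF;
	return quota_us * NSEC_PER_USEC;
}

static uint64_t quota_ns_to_us(uint64_t quota_ns)
{
	if (quota_ns == RUNTIME_INF)
		return RUNTIME_INF;
	return quota_ns / NSEC_PER_USEC;
}

int main(void)
{
	uint64_t stored = quota_us_to_ns(50000);	/* 50ms quota */

	printf("stored=%llu ns, read back=%llu us\n",
	       (unsigned long long)stored,
	       (unsigned long long)quota_ns_to_us(stored));
	printf("inf stays inf: %d\n", quota_us_to_ns(RUNTIME_INF) == RUNTIME_INF);
	return 0;
}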
+ */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) + return -EINVAL; + + if (quota_us != RUNTIME_INF && (burst_us > quota_us || + burst_us + quota_us > max_bw_runtime_us)) + return -EINVAL; + +#ifdef CONFIG_CFS_BANDWIDTH + ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#endif /* CONFIG_CFS_BANDWIDTH */ + if (!ret) + scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); + return ret; +} + +static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 quota_us; + + tg_bandwidth(css_tg(css), NULL, "a_us, NULL); + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ +} + +static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 burst_us; + + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); + return burst_us; +} + +static int cpu_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period_us) +{ + struct task_group *tg = css_tg(css); + u64 quota_us, burst_us; + + tg_bandwidth(tg, NULL, "a_us, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 quota_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, burst_us; + + if (quota_us < 0) + quota_us = RUNTIME_INF; + + tg_bandwidth(tg, &period_us, NULL, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 burst_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, quota_us; + + tg_bandwidth(tg, &period_us, "a_us, NULL); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ + #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 val) @@ -9821,7 +9850,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9836,22 +9865,24 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, + .name = "cfs_period_us", + .read_u64 = cpu_period_read_u64, + .write_u64 = cpu_period_write_u64, }, { - .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, + .name = "cfs_quota_us", + .read_s64 = cpu_quota_read_s64, + .write_s64 = cpu_quota_write_s64, }, { .name = "cfs_burst_us", - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH { .name = "stat", .seq_show = cpu_cfs_stat_show, @@ -9861,18 +9892,6 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_cfs_local_stat_show, }, #endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif #ifdef 
CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -9890,6 +9909,55 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +#ifdef CONFIG_RT_GROUP_SCHED +static struct cftype rt_group_files[] = { + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, + { } /* Terminate */ +}; + +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DEFINE_STATIC_KEY_FALSE(rt_group_sched); +# else +DEFINE_STATIC_KEY_TRUE(rt_group_sched); +# endif + +static int __init setup_rt_group_sched(char *str) +{ + long val; + + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { + pr_warn("Unable to set rt_group_sched\n"); + return 1; + } + if (val) + static_branch_enable(&rt_group_sched); + else + static_branch_disable(&rt_group_sched); + + return 1; +} +__setup("rt_group_sched=", setup_rt_group_sched); + +static int __init cpu_rt_group_init(void) +{ + if (!rt_group_sched_enabled()) + return 0; + + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); + return 0; +} +subsys_initcall(cpu_rt_group_init); +#endif /* CONFIG_RT_GROUP_SCHED */ + static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -9912,7 +9980,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10010,32 +10078,32 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, } /* caller should put the current value in *@periodp before calling */ -static int __maybe_unused cpu_period_quota_parse(char *buf, - u64 *periodp, u64 *quotap) +static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, + u64 *quota_us_p) { char tok[21]; /* U64_MAX */ - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) return -EINVAL; - *periodp *= NSEC_PER_USEC; - - if (sscanf(tok, "%llu", quotap)) - *quotap *= NSEC_PER_USEC; - else if (!strcmp(tok, "max")) - *quotap = RUNTIME_INF; - else - return -EINVAL; + if (sscanf(tok, "%llu", quota_us_p) < 1) { + if (!strcmp(tok, "max")) + *quota_us_p = RUNTIME_INF; + else + return -EINVAL; + } return 0; } -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + u64 period_us, quota_us; - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + tg_bandwidth(tg, &period_us, "a_us, NULL); + cpu_period_quota_print(sf, period_us, quota_us); return 0; } @@ -10043,17 +10111,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct task_group *tg = css_tg(of_css(of)); - u64 period = tg_get_cfs_period(tg); - u64 burst = tg->cfs_bandwidth.burst; - u64 quota; + u64 period_us, quota_us, burst_us; int ret; - ret = cpu_period_quota_parse(buf, &period, "a); + tg_bandwidth(tg, &period_us, NULL, &burst_us); + ret = cpu_period_quota_parse(buf, &period_us, "a_us); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota, burst); + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10076,7 +10143,7 @@ static struct cftype cpu_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif 
-#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, @@ -10086,10 +10153,10 @@ static struct cftype cpu_files[] = { { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10103,7 +10170,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10124,7 +10191,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10192,557 +10259,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID - -/* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. - */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward progress - * of a cid allocation. - */ -int use_cid_lock; - /* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid - * concurrently with respect to the execution of the source runqueue context - * switch. - * - * There is one basic properties we want to guarantee here: + * Concurrency IDentifier management * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. - * - * Provide this guarantee by introducing a Dekker memory ordering to guarantee - * that a pair of loads observe at least one of a pair of stores, which can be - * shown as: + * Serialization rules: * - * X = Y = 0 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. * - * Which guarantees that x==0 && y==0 is impossible. But rather than using - * values 0 and 1, this algorithm cares about specific state transitions of the - * runqueue current task (as updated by the scheduler context switch), and the - * per-mm/cpu cid value. + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. * - * Let's introduce task (Y) which has task->mm == mm and task (N) which has - * task->mm != mm for the rest of the discussion. There are two scheduler state - * transitions on context switch we care about: + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. 
* - * (TSA) Store to rq->curr with transition from (N) to (Y) + * CID ownership: * - * (TSB) Store to rq->curr with transition from (Y) to (N) + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the + * task needs to drop the CID into the pool when scheduling out. Both bits + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is + * actually handed over to user space in the RSEQ memory. * - * On the remote-clear side, there is one transition we care about: + * Mode switching: * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * Switching to per CPU mode happens when the user count becomes greater + * than the maximum number of CIDs, which is calculated by: * - * There is also a transition to UNSET state which can be performed from all - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which - * guarantees that only a single thread will succeed: + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * The +25% allowance is useful for tight CPU masks in scenarios where only + * a few threads are created and destroyed to avoid frequent mode + * switches. Though this allowance shrinks, the closer opt_cids becomes to + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. * - * Just to be clear, what we do _not_ want to happen is a transition to UNSET - * when a thread is actively using the cid (property (1)). + * At the point of switching to per CPU mode the new user is not yet + * visible in the system, so the task which initiated the fork() runs the + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either transfers each tasks owned CID to the CPU the task runs on or + * drops it into the CID pool if a task is not on a CPU at that point in + * time. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes + * it's guaranteed that no task related to that MM owns a CID anymore. * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * Switching back to task mode happens when the user count goes below the + * threshold which was recorded on the per CPU mode switch: * - * Scenario A) (TSA)+(TMA) (from next task perspective) + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); * - * CPU0 CPU1 + * This threshold is updated when a affinity change increases the number of + * allowed CPUs for the MM, which might cause a switch back to per task + * mode. * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) - * (implied barrier after cmpxchg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * If the switch back was initiated by a exiting task, then that task runs + * the fixup function. If it was initiated by a affinity change, then it's + * run either in the deferred update function in context of a workqueue or + * by a task which forks a new one or by a task which exits. 
Whatever + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible + * CPUs and either transfers the CPU owned CIDs to a related task which + * runs on the CPU or drops it into the pool. Tasks which schedule in on a + * CPU which the walk did not cover yet do the handover themself. * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are - * observed. + * This transition from CPU to per task ownership happens in two phases: * - * If task (Y) store is observed by rcu_dereference(), it means that there is - * still an active task on the cpu. Remote-clear will therefore not transition - * to UNSET, which fulfills property (1). + * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task + * CID and denotes that the CID is only temporarily owned by the + * task. When it schedules out the task drops the CID back into the + * pool if this bit is set. * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has the - * LAZY flag set, only one thread will successfully UNSET. + * 2) The initiating context walks the per CPU space and after completion + * clears mm:mm_cid.transit. So after that point the CIDs are strictly + * task owned again. * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and - * CPU1 will observe task (Y) and do nothing more, which is fine. + * This two phase transition is required to prevent CID space exhaustion + * during the transition as a direct transfer of ownership would fail if + * two tasks are scheduled in on the same CPU before the fixup freed per + * CPU CIDs. * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property (1) - * because this would UNSET a cid which is actively used. + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID + * related to that MM is owned by a CPU anymore. */ -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu = task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) +/* + * Update the CID range properties when the constraints change. Invoked via + * fork(), exit() and affinity changes + */ +static void __mm_update_max_cids(struct mm_mm_cid *mc) { - struct mm_struct *mm = t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; + unsigned int opt_cids, max_cids; - if (!mm) - return -1; + /* Calculate the new optimal constraint */ + opt_cids = min(mc->nr_cpus_allowed, mc->users); - last_mm_cid = t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. 
- */ - if (last_mm_cid == -1) - return -1; - src_cid = READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) - return -1; + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); + WRITE_ONCE(mc->max_cids, max_cids); +} - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - t->last_mm_cid = -1; - return -1; - } +static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) +{ + unsigned int opt_cids; - return src_cid; + opt_cids = min(mc->nr_cpus_allowed, mc->users); + /* Has to be at least 1 because 0 indicates PCPU mode off */ + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); } -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) +static bool mm_update_max_cids(struct mm_struct *mm) { - struct task_struct *src_task; - struct mm_struct *mm = t->mm; - int lazy_cid; + struct mm_mm_cid *mc = &mm->mm_cid; - if (src_cid == -1) - return -1; + lockdep_assert_held(&mm->mm_cid.lock); - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid = mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; + /* Clear deferred mode switch flag. A change is handled by the caller */ + mc->update_deferred = false; + __mm_update_max_cids(mc); - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ - - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; - } + /* Check whether owner mode must be changed */ + if (!mc->percpu) { + /* Enable per CPU mode when the number of users is above max_cids */ + if (mc->users > mc->max_cids) + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + } else { + /* Switch back to per task if user count under threshold */ + if (mc->users < mc->pcpu_thrs) + mc->pcpu_thrs = 0; } - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; + /* Mode change required? 
*/ + if (!!mc->percpu == !!mc->pcpu_thrs) + return false; + /* When switching back to per TASK mode, set the transition flag */ + if (!mc->pcpu_thrs) + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership without the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm = t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; + struct cpumask *mm_allowed; + struct mm_mm_cid *mc; + unsigned int weight; - lockdep_assert_rq_held(dst_rq); - - if (!mm) - return; - src_cpu = t->migrate_from_cpu; - if (src_cpu == -1) { - t->last_mm_cid = -1; + if (!mm || !READ_ONCE(mm->mm_cid.users)) return; - } /* - * Move the src cid if the dst cid is unset. This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. - * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. + */ + mc = &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); + mm_allowed = mm_cpus_allowed(mm); + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + if (weight == mc->nr_cpus_allowed) return; - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq = cpu_rq(src_cpu); - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); - if (src_cid == -1) + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) return; - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, - src_cid); - if (src_cid == -1) + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? */ + if (mc->users >= mc->pcpu_thrs) return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); + + /* Don't queue twice */ + if (mc->update_deferred) return; - } - /* Move src_cid to dst cpu. 
*/ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); + + /* Queue the irq work, which schedules the real work */ + mc->update_deferred = true; + irq_work_queue(&mc->irq_work); } -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, - int cpu) +static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; + if (cid_on_cpu(t->mm_cid.cid)) { + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; + t->mm_cid.cid = cid_to_transit_cid(cid); + pcp->cid = t->mm_cid.cid; + } +} - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid = mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq = cpu_rq(cpu); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) - return; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + /* Is the CID still owned by the CPU? */ + if (cid_on_cpu(pcp->cid)) { + /* + * If rq->curr has @mm, transfer it with the + * transition bit set. Otherwise drop it. + */ + if (rq->curr->mm == mm && rq->curr->mm_cid.active) + mm_cid_transit_to_task(rq->curr, pcp); + else + mm_drop_cid_on_cpu(mm, pcp); + + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { + unsigned int cid = rq->curr->mm_cid.cid; + + /* Ensure it has the transition bit set */ + if (!cid_in_transit(cid)) { + cid = cid_to_transit_cid(cid); + rq->curr->mm_cid.cid = cid; + pcp->cid = cid; + } + } } + /* Clear the transition bit */ + WRITE_ONCE(mm->mm_cid.transit, 0); +} - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. 
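/*
 * Minimal sketch of the MM_CID_TRANSIT handover described above: a CID
 * handed from a CPU to a task carries a transit flag and is dropped back
 * into the pool if the task schedules out before the fixup completes.
 * The flag value and helper names below are assumptions for illustration;
 * the patch defines its own encoding and cid_*() helpers.
 */
#include <stdio.h>
#include <stdbool.h>

#define EX_CID_TRANSIT	0x80000000u	/* assumed flag bit, not the real value */

static unsigned int ex_cid_to_transit(unsigned int cid)   { return cid | EX_CID_TRANSIT; }
static unsigned int ex_cid_from_transit(unsigned int cid) { return cid & ~EX_CID_TRANSIT; }
static bool ex_cid_in_transit(unsigned int cid)           { return cid & EX_CID_TRANSIT; }

int main(void)
{
	unsigned int cid = 3;

	/* Phase 1: CPU-owned CID handed to the task, marked as temporary */
	cid = ex_cid_to_transit(cid);
	printf("cid=%#x in_transit=%d\n", cid, ex_cid_in_transit(cid));

	/* Phase 2 complete: strictly task-owned again, strip the flag */
	cid = ex_cid_from_transit(cid);
	printf("cid=%#x in_transit=%d\n", cid, ex_cid_in_transit(cid));
	return 0;
}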
- */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); +static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) +{ + if (cid_on_task(t->mm_cid.cid)) { + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); + pcp->cid = t->mm_cid.cid; } } -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + /* If the task is not active it is not in the users count */ + if (!t->mm_cid.active) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr == t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; +} - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); +static void mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm = current->mm; + struct task_struct *p, *t; + unsigned int users; /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } + users = mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + if (mm_cid_fixup_task_to_cpu(t, mm)) + users--; } - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + if (!users) return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + + /* Happens only for VM_CLONE processes. 
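/*
 * Worked example for the "users - 2" above, assuming a process with three
 * existing threads that calls pthread_create() (numbers are illustrative):
 *
 *	users after accounting	= 3 existing + 1 new thread	= 4
 *	current			already transferred its own CID	- 1
 *	the new thread		not yet visible, nothing to fix	- 1
 *	threads left to walk	= users - 2			= 2
 */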
*/ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } } -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } -static void task_mm_cid_work(struct callback_head *work) +void sched_mm_cid_fork(struct task_struct *t) { - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; + struct mm_struct *mm = t->mm; + bool percpu; - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; + guard(mutex)(&mm->mm_cid.mutex); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); else - old_scan = next_scan; + mm_cid_transfer_to_cpu(current, pcp); } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. 
- */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} - -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - int mm_users = 0; - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); } -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +static bool sched_mm_cid_remove_user(struct task_struct *t) { - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); + t->mm_cid.active = 0; + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } -void sched_mm_cid_exit_signals(struct task_struct *t) +static bool __sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - - if (!mm) - return; - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); + if (!sched_mm_cid_remove_user(t)) + return false; /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; + /* + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. + */ + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; } -void sched_mm_cid_before_execve(struct task_struct *t) +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. + */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. 
*/ + sched_mm_cid_remove_user(t); + } + } - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); } +/* Deactivate MM CID allocation across execve() */ +void sched_mm_cid_before_execve(struct task_struct *t) +{ + sched_mm_cid_exit(t); +} + +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; + sched_mm_cid_fork(t); +} + +static void mm_cid_work_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); - if (!mm) + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; } + mm_cid_fixup_cpus_to_tasks(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void mm_cid_irq_work(struct irq_work *work) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); - t->mm_cid_active = 1; + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... 
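/*
 * Stripped-down sketch of the deferral pattern above: an irq_work, which is
 * safe to queue while nested inside rq->lock, bounces to a workqueue item
 * that may then sleep and take mutexes. Kernel-style, illustrative only;
 * the names are made up, the layout mirrors the mm_mm_cid irq_work/work pair.
 */
#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>

struct ex_deferred_fixup {
	struct irq_work		irq_work;
	struct work_struct	work;
};

static void ex_deferred_fixup_work(struct work_struct *work)
{
	/* Process context: may sleep, take mutexes, walk remote runqueues */
}

static void ex_deferred_fixup_irq_work(struct irq_work *iw)
{
	struct ex_deferred_fixup *df = container_of(iw, struct ex_deferred_fixup, irq_work);

	/* Hard interrupt context, not under rq->lock: the wakeup in schedule_work() is fine here */
	schedule_work(&df->work);
}

static void ex_deferred_fixup_init(struct ex_deferred_fixup *df)
{
	df->irq_work = IRQ_WORK_INIT_HARD(ex_deferred_fixup_irq_work);
	INIT_WORK(&df->work, ex_deferred_fixup_work);
}

/* A caller nested inside rq->lock cannot call schedule_work() directly: */
static void ex_deferred_fixup_kick(struct ex_deferred_fixup *df)
{
	irq_work_queue(&df->irq_work);
}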
+ */ + schedule_work(&mm->mm_cid.work); } -#endif -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { + mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; + raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ + +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) +{ + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca89210..9ede71ecba7f 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. 
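/*
 * Assumed call-site shape for the change-context pair introduced above:
 * dequeue/put under the rq lock, mutate the task, then re-enqueue and run
 * the class callbacks via the saved context. Fragment only, not part of
 * the patch; the direct p->prio write is purely illustrative.
 */
static void ex_change_prio(struct rq *rq, struct task_struct *p, int new_prio)
{
	struct sched_change_ctx *ctx;

	lockdep_assert_rq_held(rq);

	ctx = sched_change_begin(p, 0);		/* no class change, clock updated inside */
	p->prio = new_prio;			/* illustrative mutation */
	sched_change_end(ctx);			/* re-enqueue + prio_changed() callback */
}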
*/ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..23a56ba12d81 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include <linux/sched/cputime.h> +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a1029..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli <j.lelli@sssup.it> */ +#include "sched.h" static inline int parent(int i) { @@ -165,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -183,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -194,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -227,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -237,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/spinlock.h> #define IDX_INVALID -1 @@ -15,12 +17,8 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl 
*cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191fae..742fb9e62e1a 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 816f07f9d30f..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <uapi/linux/sched/types.h> +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL @@ -630,7 +632,7 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -909,7 +911,7 @@ static void sugov_limits(struct cpufreq_policy *policy) WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, @@ -927,4 +929,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif +bool sugov_is_governor(struct cpufreq_policy *policy) +{ + return policy->governor == &schedutil_gov; +} + cpufreq_governor_init(schedutil_gov); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf836..76a9ac5eb794 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. 
*/ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064..6f562088c056 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/atomic.h> +#include <linux/cpumask.h> +#include <linux/sched/rt.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) @@ -17,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c0..4f97896887ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include <linux/sched/cputime.h> +#include <linux/tsacct_kern.h> +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include <asm/cputime.h> @@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -310,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - u64 utime, stime; struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; + u64 utime, stime; /* * Update current task runtime to account pending time since last @@ -326,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) if (same_thread_group(current, tsk)) (void) task_sched_runtime(current); - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; - for_each_thread(tsk, t) { + __for_each_thread(sig, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += read_sum_exec_runtime(t); } - /* If lockless access failed, take the lock. 
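/*
 * For reference, the classic seqlock read-retry idiom that the
 * scoped_seqlock_read() guard used above presumably wraps: snapshot the
 * sequence count, copy the data, retry if a writer intervened. Kernel-style
 * sketch with made-up field names, illustrative only.
 */
#include <linux/seqlock.h>

struct ex_cputime_stats {
	seqlock_t	lock;
	u64		utime, stime;
};

static void ex_cputime_read(struct ex_cputime_stats *s, u64 *utime, u64 *stime)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&s->lock);		/* snapshot sequence count */
		*utime = s->utime;
		*stime = s->stime;
	} while (read_seqretry(&s->lock, seq));		/* raced with a writer: retry */
}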
*/ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -411,11 +404,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..319439fe1870 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include <linux/cpuset.h> +#include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util @@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif +#endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -122,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); - if (cpumask_subset(rd->span, cpu_active_mask)) - return cpumask_weight(rd->span); - - cpus = 0; - - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; + return cpumask_weight_and(rd->span, cpu_active_mask); } static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) @@ -191,35 +185,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -431,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). 
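/*
 * The dl_bw_cpus() change earlier in this hunk replaces an open-coded
 * for_each_cpu_and() count with cpumask_weight_and(); both count the CPUs
 * present in the intersection of the two masks. Kernel-style sketch,
 * illustrative only.
 */
#include <linux/cpumask.h>

static unsigned int ex_count_active_old(const struct cpumask *span)
{
	unsigned int cpus = 0;
	int cpu;

	for_each_cpu_and(cpu, span, cpu_active_mask)
		cpus++;
	return cpus;
}

static unsigned int ex_count_active_new(const struct cpumask *span)
{
	return cpumask_weight_and(span, cpu_active_mask);	/* same result, one pass */
}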
*/ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -470,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -552,23 +517,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -753,37 +712,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -824,6 +752,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + update_rq_clock(rq); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); @@ -936,7 +866,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1195,7 +1125,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -1209,7 +1138,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1229,11 +1157,15 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ sched_clock_tick(); update_rq_clock(rq); - if (!dl_se->dl_runtime) - return HRTIMER_NORESTART; + /* + * Make sure current has propagated its pending runtime into + * any relevant server through calling dl_server_update() and + * friends. 
+ */ + rq->donor->sched_class->update_curr(rq); - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); return HRTIMER_NORESTART; } @@ -1339,7 +1271,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1356,7 +1287,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. */ } -#endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1486,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1504,10 +1435,15 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1518,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler * was able to get runtime in background. So return to the initial @@ -1529,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. 
If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1556,10 +1518,12 @@ throttle: } if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { - if (dl_server(dl_se)) - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - else + if (dl_server(dl_se)) { + replenish_dl_new_period(dl_se, rq); + start_dl_timer(dl_se); + } else { enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } } if (!is_leftmost(dl_se, &rq->dl)) @@ -1598,7 +1562,7 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1609,61 +1573,226 @@ throttle: * as time available for the fair server, avoiding a penalty for the * rt scheduler that did not consumed that time. */ -void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { - s64 delta_exec, scaled_delta_exec; - - if (!rq->fair_server.dl_defer) - return; - - /* no need to discount more */ - if (rq->fair_server.runtime < 0) - return; - - delta_exec = rq_clock_task(rq) - p->se.exec_start; - if (delta_exec < 0) - return; - - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; - - if (rq->fair_server.runtime < 0) { - rq->fair_server.dl_defer_running = 0; - rq->fair_server.runtime = 0; - } - - p->se.exec_start = rq_clock_task(rq); + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_server_active && dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } +/* + * dl_server && dl_defer: + * + * 6 + * +--------------------+ + * v | + * +-------------+ 4 +-----------+ 5 +------------------+ + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | + * | +-------------+ +-----------+ +------------------+ + * | | | 1 ^ ^ | + * | | 1 +----------+ | 3 | + * | v | | + * | +--------------------------------+ 2 | + * | | | ----+ | + * | 8 | B:zero_laxity-wait | | | + * | | | <---+ | + * | +--------------------------------+ | + * | | ^ ^ 2 | + * | | 7 | 2 +--------------------+ + * | v | + * | +-------------+ | + * +-- | C:idle-wait | -+ + * +-------------+ + * ^ 7 | + * +---------+ + * + * + * [A] - init + * dl_server_active = 0 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 0/1 + * dl_defer_idle = 0 + * + * [B] - zero_laxity-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 0 + * + * [C] - idle-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 1 + * + * [D] - running + * dl_server_active = 1 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * [E] - replenish-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * + * [1] A->B, A->D + * dl_server_start() + * dl_server_active = 1; + * enqueue_dl_entity() + * update_dl_entity(WAKEUP) + * if (!dl_defer_running) + * dl_defer_armed = 1; + * dl_throttled = 1; + * if (dl_throttled && start_dl_timer()) + * return; // [B] + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from client-class + * [2] B->B, C->B, E->B + * 
dl_server_update() + * update_curr_dl_se() // idle = false + * if (dl_defer_idle) + * dl_defer_idle = 0; + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); // stop timer + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * start_dl_timer(); // restart timer + * // [B] + * + * // timer actually fires means we have runtime + * [3] B->D + * dl_server_timer() + * if (dl_defer_armed) + * dl_defer_running = 1; + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * // fwd period + * if (dl_throttled) + * dl_throttled = 0; + * if (dl_defer_armed) + * dl_defer_armed = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // schedule server + * [4] D->A + * pick_task_dl() + * p = server_pick_task(); + * if (!p) + * dl_server_stop() + * dequeue_dl_entity(); + * hrtimer_try_to_cancel(); + * dl_defer_armed = 0; + * dl_throttled = 0; + * dl_server_active = 0; + * // [A] + * return p; + * + * // server running + * [5] D->E + * update_curr_dl_se() + * if (dl_runtime_exceeded()) + * dl_throttled = 1; + * dequeue_dl_entity(); + * start_dl_timer(); + * // [E] + * + * // server replenished + * [6] E->D + * dl_server_timer() + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * fwd-period + * if (dl_throttled) + * dl_throttled = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from idle + * [7] B->C, C->C + * dl_server_update_idle() + * update_curr_dl_se() // idle = true + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * if (dl_defer_idle) + * return; + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * dl_defer_idle = 1; + * start_dl_timer(); // restart timer + * // [C] + * + * // stop idle server + * [8] C->A + * dl_server_timer() + * if (dl_defer_idle) + * dl_server_stop(); + * // [A] + * + * + * digraph dl_server { + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] + * "A:init" -> "D:running" [label="1:dl_server_start"] + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "D:running" -> "A:init" [label="4:pick_task_dl"] + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] + * } + * + * + * Notes: + * + * - When there are fair tasks running the most likely loop is [2]->[2]. + * the dl_server never actually runs, the timer never fires. + * + * - When there is actual fair starvation; the timer fires and starts the + * dl_server. This will then throttle and replenish like a normal DL + * task. Notably it will not 'defer' again. + * + * - When idle it will push the actication forward once, and then wait + * for the timer to hit or a non-idle update to restart things. + */ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; + if (!dl_server(dl_se) || dl_se->dl_server_active) + return; + /* - * XXX: the apply do not work fine at the init phase for the - * fair server because things are not yet set. 
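/*
 * The five fair-server states listed above, folded into a small lookup
 * table over the flag bits named in the comment; purely illustrative,
 * the enum and struct below are not part of the patch.
 */
enum ex_dl_server_state {
	EX_DL_SRV_INIT,
	EX_DL_SRV_ZERO_LAXITY_WAIT,
	EX_DL_SRV_IDLE_WAIT,
	EX_DL_SRV_RUNNING,
	EX_DL_SRV_REPLENISH_WAIT,
};

struct ex_dl_server_flags {
	unsigned int active:1, throttled:1, defer_armed:1, defer_running:1, defer_idle:1;
};

static const struct ex_dl_server_flags ex_dl_server_states[] = {
	[EX_DL_SRV_INIT]		= { 0, 0, 0, 0, 0 },	/* defer_running may be 0 or 1 */
	[EX_DL_SRV_ZERO_LAXITY_WAIT]	= { 1, 1, 1, 0, 0 },
	[EX_DL_SRV_IDLE_WAIT]		= { 1, 1, 1, 0, 1 },
	[EX_DL_SRV_RUNNING]		= { 1, 0, 0, 1, 0 },
	[EX_DL_SRV_REPLENISH_WAIT]	= { 1, 1, 0, 1, 0 },
};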
We need to improve - * this before getting generic. + * Update the current task to 'now'. */ - if (!dl_server(dl_se)) { - u64 runtime = 50 * NSEC_PER_MSEC; - u64 period = 1000 * NSEC_PER_MSEC; + rq->donor->sched_class->update_curr(rq); - dl_server_apply_params(dl_se, runtime, period, 1); - - dl_se->dl_server = 1; - dl_se->dl_defer = 1; - setup_new_dl_entity(dl_se); - } - - if (!dl_se->dl_runtime) + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; dl_se->dl_server_active = 1; @@ -1674,25 +1803,50 @@ void dl_server_start(struct sched_dl_entity *dl_se) void dl_server_stop(struct sched_dl_entity *dl_se) { - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } +void sched_init_dl_servers(void) +{ + int cpu; + struct rq *rq; + struct sched_dl_entity *dl_se; + + for_each_online_cpu(cpu) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + rq = cpu_rq(cpu); + + guard(rq_lock_irq)(rq); + + dl_se = &rq->fair_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } +} + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) { u64 new_bw = dl_se->dl_bw; @@ -1844,8 +1998,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1870,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -1881,20 +2033,15 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { u64 deadline = dl_se->deadline; dl_rq->dl_nr_running++; - add_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); } @@ -1904,7 +2051,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - sub_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); } @@ -2110,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2166,6 +2315,9 @@ static void 
enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (dl_server(&p->dl)) return; + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2202,7 +2354,7 @@ static void yield_task_dl(struct rq *rq) * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - rq->curr->dl.dl_yielded = 1; + rq->donor->dl.dl_yielded = 1; update_rq_clock(rq); update_curr_dl(rq); @@ -2214,8 +2366,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2234,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct rq *rq; if (!(flags & WF_TTWU)) - goto out; + return cpu; rq = cpu_rq(cpu); @@ -2272,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) } rcu_read_unlock(); -out: return cpu; } @@ -2345,7 +2494,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2359,7 +2507,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... @@ -2367,7 +2514,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2375,11 +2521,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -2419,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. 
*/ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2433,12 +2579,9 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { - if (dl_server_active(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; @@ -2449,9 +2592,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2465,6 +2608,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2500,8 +2647,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2530,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; } +/* Access rule: must be called on local CPU with preemption disabled */ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -2619,6 +2765,25 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); + + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -2646,12 +2811,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || + /* + * double_lock_balance had to release rq->lock, in the + * meantime, task may no longer be fit to be migrated. + * Check the following to ensure that the task is + * still suitable for migration: + * 1. It is possible the task was scheduled, + * migrate_disabled was set and then got preempted, + * so we must check the task migration disable + * flag. + * 2. The CPU picked is in the task's affinity. + * 3. For throttled task (dl_task_offline_migration), + * check the following: + * - the task is not on the rq anymore (it was + * migrated) + * - the task is not on CPU anymore + * - the task is still a dl task + * - the task is not queued on the rq anymore + * 4. For the non-throttled task (push_dl_task), the + * check to ensure that this task is still at the + * head of the pushable tasks list is enough. 
+ */ + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !dl_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + (task->dl.dl_throttled && + (task_rq(task) != rq || + task_on_cpu(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) || + (!task->dl.dl_throttled && + task != pick_next_pushable_dl_task(rq)))) { + double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2674,25 +2864,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - - WARN_ON_ONCE(rq->cpu != task_cpu(p)); - WARN_ON_ONCE(task_current(rq, p)); - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - - WARN_ON_ONCE(!task_on_rq_queued(p)); - WARN_ON_ONCE(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt @@ -2923,9 +3094,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2934,8 +3106,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2947,11 +3118,43 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +/* + * This function always returns a non-empty bitmap in @cpus. This is because + * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth + * check will prevent CPU hotplug from deactivating all CPUs in that domain. + */ +static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus) +{ + const struct cpumask *hk_msk; + + hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN); + if (housekeeping_enabled(HK_TYPE_DOMAIN)) { + if (!cpumask_intersects(p->cpus_ptr, hk_msk)) { + /* + * CPUs isolated by isolcpu="domain" always belong to + * def_root_domain. + */ + cpumask_andnot(cpus, cpu_active_mask, hk_msk); + return; + } + } + + /* + * If a root domain holds a DL task, it must have active CPUs. So + * active CPUs can always be found by walking up the task's cpuset + * hierarchy up to the partition root. + */ + cpuset_cpus_allowed_locked(p, cpus); +} + +/* The caller should hold cpuset_mutex */ void dl_add_task_root_domain(struct task_struct *p) { struct rq_flags rf; struct rq *rq; struct dl_bw *dl_b; + unsigned int cpu; + struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (!dl_task(p) || dl_entity_is_special(&p->dl)) { @@ -2959,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p) return; } - rq = __task_rq_lock(p, &rf); - + /* + * Get an active rq, whose rq->rd traces the correct root + * domain. + * Ideally this would be under cpuset reader lock until rq->rd is + * fetched. However, sleepable locks cannot nest inside pi_lock, so we + * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex' + * to guarantee the CPU stays in the cpuset. 
+ */ + dl_get_task_effective_cpus(p, msk); + cpu = cpumask_first_and(cpu_active_mask, msk); + BUG_ON(cpu >= nr_cpu_ids); + rq = cpu_rq(cpu); dl_b = &rq->rd->dl_bw; - raw_spin_lock(&dl_b->lock); + /* End of fetching rd */ + raw_spin_lock(&dl_b->lock); __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); - raw_spin_unlock(&dl_b->lock); - - task_rq_unlock(rq, p, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } void dl_clear_root_domain(struct root_domain *rd) @@ -2976,7 +3188,14 @@ void dl_clear_root_domain(struct root_domain *rd) int i; guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); + + /* + * Reset total_bw to zero and extra_bw to max_bw so that next + * loop will add dl-servers contributions back properly, + */ rd->dl_bw.total_bw = 0; + for_each_cpu(i, rd->span) + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; /* * dl_servers are not tasks. Since dl_add_task_root_domain ignores @@ -2995,8 +3214,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3008,7 +3225,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3069,10 +3286,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3082,24 +3297,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3121,13 +3336,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. 
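With ->get_prio() returning the deadline and ->prio_changed() now receiving it back, the expected call shape from the core is roughly the following sketch; the snapshot point is assumed here, as it is not shown in this hunk:

/* Illustrative only: snapshot the old deadline via ->get_prio() before the
 * parameter change, then hand it to ->prio_changed() so the class can tell
 * whether the deadline moved later and a pull may be needed. */
static void example_dl_params_changed(struct rq *rq, struct task_struct *p,
                                      u64 new_deadline)
{
        u64 old_prio = p->sched_class->get_prio(rq, p); /* deadline for DL */

        p->dl.deadline = new_deadline;                  /* parameter update */
        p->sched_class->prio_changed(rq, p, old_prio);  /* may pull/resched */
}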
- */ - resched_curr(rq); -#endif } #ifdef CONFIG_SCHED_CORE @@ -3139,6 +3347,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3149,7 +3359,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3158,11 +3367,11 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, @@ -3242,6 +3451,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3257,7 +3469,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } @@ -3458,7 +3669,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3570,7 +3780,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56ae54e0ce6a..41caa22e0680 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include <linux/debugfs.h> +#include <linux/nmi.h> +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and @@ -90,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -381,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_queued) { - update_rq_clock(rq); - dl_server_stop(&rq->fair_server); - } + update_rq_clock(rq); + 
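Most of the knobs touched in this debug.c hunk follow the same minimal debugfs pattern; a self-contained sketch of that pattern (directory and knob names hypothetical, not part of the patch):

#include <linux/debugfs.h>
#include <linux/init.h>

static bool sched_example_knob;
static unsigned int sched_example_cost_ns = 500000;

static __init int sched_example_debug_init(void)
{
        struct dentry *dir = debugfs_create_dir("sched_example", NULL);

        /* boolean and u32 knobs need no custom file_operations at all */
        debugfs_create_bool("knob", 0644, dir, &sched_example_knob);
        debugfs_create_u32("cost_ns", 0644, dir, &sched_example_cost_ns);
        return 0;
}
late_initcall(sched_example_debug_init);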
dl_server_stop(&rq->fair_server); retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); if (retval) @@ -512,7 +505,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -520,7 +512,6 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -530,7 +521,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -540,8 +531,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -588,6 +577,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); + + if (sd->flags & SD_ASYM_PACKING) + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, + (u32 *)&sd->groups->asym_prefer_cpu); } void update_sched_domain_debugfs(void) @@ -648,8 +641,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -686,18 +677,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -807,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -830,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; + zero_vruntime = cfs_rq->zero_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", 
SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", + SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", @@ -850,7 +839,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -870,8 +858,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -925,11 +912,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); @@ -947,9 +930,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -972,12 +955,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1159,7 +1140,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1243,7 +1224,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1252,13 +1232,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fdbf249d1c68..8f6d8d7f895c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,893 +9,15 @@ #include <linux/btf_ids.h> #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - 
SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_OPS_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or - * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN - * respectively. The codes are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. - */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. - */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. 
If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. - */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. 
Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. - * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. 
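The callbacks documented so far (select_cpu, enqueue, dispatch) are enough for a complete scheduler. Below is a rough sketch loosely modeled on the in-tree scx_simple example; the scx/common.bpf.h header and the BPF_STRUCT_OPS() macro come from the SCX tooling and are assumed here:

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
        bool is_idle = false;
        s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

        /* direct insertion from select_cpu skips ops.enqueue(), as noted above */
        if (is_idle)
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
        return cpu;
}

void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
        scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops minimal_ops = {
        .select_cpu     = (void *)minimal_select_cpu,
        .enqueue        = (void *)minimal_enqueue,
        .name           = "minimal",
};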
- */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. - * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. 
- * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. - * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. 
- */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). - */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation my block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). 
- * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @tg's weight to @weight. - */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. 
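A typical use of the flag described above, from a BPF scheduler's enqueue path (sketch; kfunc and flag names as referenced in these comments):

/* Preempt whatever @cpu is running: the current task's slice is cleared and
 * the CPU is pushed through the scheduling path (implies SCX_ENQ_HEAD). */
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, SCX_ENQ_PREEMPT);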
- */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. - */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, Execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_ops_enable_state { - SCX_OPS_ENABLING, - SCX_OPS_ENABLED, - SCX_OPS_DISABLING, - SCX_OPS_DISABLED, -}; - -static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_ENABLING] = "enabling", - [SCX_OPS_ENABLED] = "enabled", - [SCX_OPS_DISABLING] = "disabling", - [SCX_OPS_DISABLED] = "disabled", -}; - /* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. 
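The store_release/load_acquire pairing mentioned above is the standard ownership-handoff idiom; a generic sketch (simplified, QSEQ branding omitted):

#include <linux/atomic.h>
#include <linux/processor.h>

static atomic_long_t example_ops_state = ATOMIC_LONG_INIT(0);   /* 0 == NONE */

static void example_publish(unsigned long new_state)
{
        /* every prior write to the task must be visible before the state flips */
        atomic_long_set_release(&example_ops_state, new_state);
}

static void example_wait_transition(unsigned long transient_state)
{
        do {
                cpu_relax();
        } while (atomic_long_read_acquire(&example_ops_state) == transient_state);
}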
- * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the + * caller is one of the tasks attached to SCX and explicit RCU dereference is + * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but + * are used as temporary markers to indicate that the dereferences need to be + * updated to point to the associated scheduler instances rather than scx_root. */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +static struct scx_sched __rcu *scx_root; /* * During exit, a task may schedule after losing its PIDs. When disabling the @@ -903,36 +25,28 @@ enum scx_ops_state { * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ -static struct kthread_worker *scx_ops_helper; -static DEFINE_MUTEX(scx_ops_enable_mutex); -DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +static DEFINE_MUTEX(scx_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -static unsigned long scx_in_softlockup; -static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); -static int scx_ops_bypass_depth; -static bool scx_ops_init_task_enabled; +static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); +static int scx_bypass_depth; +static cpumask_var_t scx_bypass_lb_donee_cpumask; +static cpumask_var_t scx_bypass_lb_resched_cpumask; +static bool scx_aborting; +static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -static struct sched_ext_ops scx_ops; -static bool scx_warned_zero_slice; - -DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); -static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); - -static struct static_key_false scx_has_op[SCX_OPI_END] = - { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; - -static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -static struct scx_exit_info *scx_exit_info; +/* + * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass + * depth on enable failure. Will be removed when bypass depth is moved into the + * sched instance. + */ +static bool scx_bypassed_for_enable; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -947,7 +61,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger - * scx_ops_error(). + * scx_error(). */ static unsigned long scx_watchdog_timeout; @@ -961,8 +75,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated + * lazily when enabling and freed when disabling to avoid waste when sched_ext + * isn't active. + */ +struct scx_kick_syncs { + struct rcu_head rcu; + unsigned long syncs[]; +}; + +static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* * Direct dispatch marker. @@ -973,23 +98,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is - * to avoid live-locking in bypass mode where all tasks are dispatched to - * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't - * sufficient, it can be further split. - */ -static struct scx_dispatch_q **global_dsqs; - static const struct rhashtable_params dsq_hash_params = { .key_len = sizeof_field(struct scx_dispatch_q, id), .key_offset = offsetof(struct scx_dispatch_q, id), .head_offset = offsetof(struct scx_dispatch_q, hash_node), }; -static struct rhashtable dsq_hash; static LLIST_HEAD(dsqs_to_free); /* dispatch buf */ @@ -1036,27 +150,73 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; -static struct kobject *scx_root_kobj; + +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. 
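The module-parameter plumbing introduced here is the stock pattern for bounded integer knobs; a self-contained sketch with hypothetical names (the sched_ext versions follow right after this):

#include <linux/moduleparam.h>

static unsigned int example_intv_us = 1000;

static int set_example_intv_us(const char *val, const struct kernel_param *kp)
{
        /* reject values outside [100us, 1s] while reusing the stock uint parser */
        return param_set_uint_minmax(val, kp, 100, 1000000);
}

static const struct kernel_param_ops example_intv_us_ops = {
        .set    = set_example_intv_us,
        .get    = param_get_uint,
};

module_param_cb(example_intv_us, &example_intv_us_ops, &example_intv_us, 0600);
MODULE_PARM_DESC(example_intv_us, "example interval in microseconds (100us to 1s)");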
+ */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; +static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); +} + +static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { + .set = set_bypass_lb_intv_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." + +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); +module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); +MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); + +#undef MODULE_PARAM_PREFIX #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...); +static u32 reenq_local(struct rq *rq); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, const char *fmt, va_list args); -#define scx_ops_error_kind(err, fmt, args...) \ - scx_ops_exit_kind((err), 0, fmt, ##args) +static __printf(4, 5) bool scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + va_list args; + bool ret; + + va_start(args, fmt); + ret = scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); -#define scx_ops_exit(code, fmt, args...) \ - scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) + return ret; +} -#define scx_ops_error(fmt, args...) \ - scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) +#define scx_error(sch, fmt, args...) 
scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) -#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) +#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) static long jiffies_delta_msecs(unsigned long at, unsigned long now) { @@ -1084,14 +244,23 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - return global_dsqs[cpu_to_node(task_cpu(p))]; + return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +static const struct sched_class *scx_setscheduler_class(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) + return &stop_sched_class; + + return __setscheduler_class(p->policy, p->prio); } /* @@ -1118,27 +287,56 @@ static void scx_kf_disallow(u32 mask) current->scx.kf_mask &= ~mask; } -#define SCX_CALL_OP(mask, op, args...) \ +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. + */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(scx_locked_rq_state, rq); +} + +#define SCX_CALL_OP(sch, mask, op, rq, args...) \ do { \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ } \ + if (rq) \ + update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(mask, op, args...) \ +#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ ({ \ - __typeof__(scx_ops.op(args)) __ret; \ + __typeof__((sch)->ops.op(args)) __ret; \ + \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ } \ + if (rq) \ + update_locked_rq(NULL); \ __ret; \ }) @@ -1153,42 +351,42 @@ do { \ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on * the specific task. */ -#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ do { \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP(mask, op, task, ##args); \ + SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) 
\ ({ \ - __typeof__(scx_ops.op(task, ##args)) __ret; \ + __typeof__((sch)->ops.op(task, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ ({ \ - __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1201,13 +399,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_ops_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_ops_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -1215,26 +413,22 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_ops_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } return true; } -static bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - /** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being iterated @@ -1314,10 +508,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1334,26 +529,24 @@ struct scx_task_iter { * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be - * visited as long as they still exist. + * visited as long as they are not dead. 
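Callers consume the iterator described above with a start/next/stop triplet; a condensed usage sketch (pattern as used elsewhere in ext.c, body trimmed):

static void example_walk_scx_tasks(void)
{
        struct scx_task_iter sti;
        struct task_struct *p;

        scx_task_iter_start(&sti);
        while ((p = scx_task_iter_next_locked(&sti))) {
                /* @p's rq lock and pi_lock are held here; inspect or poke @p */
        }
        scx_task_iter_stop(&sti);
}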
*/ static void scx_task_iter_start(struct scx_task_iter *iter) { - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + memset(iter, 0, sizeof(*iter)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; - iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1363,24 +556,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. */ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + raw_spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. - */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + raw_spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1393,6 +586,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1402,20 +596,21 @@ static void scx_task_iter_stop(struct scx_task_iter *iter) * @iter: iterator to walk * * Visit the next task. See scx_task_iter_start() for details. Locks are dropped - * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing - * stalls by holding scx_tasks_lock for too long. + * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls + * by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); } + __scx_task_iter_maybe_relock(iter); + list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) return NULL; @@ -1476,96 +671,34 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } -/* - * Collection of event counters. Event types are placed in descending order. 
- */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * The total number of tasks enqueued (or pick_task-ed) with a - * default time slice (SCX_SLICE_DFL). - */ - s64 SCX_EV_ENQ_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -/* - * The event counter is organized by a per-CPU variable to minimize the - * accounting overhead without synchronization. A system-wide view on the - * event counter is constructed when requested by scx_bpf_get_event_stat(). - */ -static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); - /** * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This can be used when preemption is not disabled. */ -#define scx_add_event(name, cnt) do { \ - this_cpu_add(event_stats_cpu.name, cnt); \ - trace_sched_ext_event(#name, cnt); \ +#define scx_add_event(sch, name, cnt) do { \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ } while(0) /** * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This should be used only when preemption is disabled. 
*/ -#define __scx_add_event(name, cnt) do { \ - __this_cpu_add(event_stats_cpu.name, cnt); \ +#define __scx_add_event(sch, name, cnt) do { \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1590,30 +723,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); } while (0) -static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); +static void scx_read_events(struct scx_sched *sch, + struct scx_event_stats *events); -static enum scx_ops_enable_state scx_ops_enable_state(void) +static enum scx_enable_state scx_enable_state(void) { - return atomic_read(&scx_ops_enable_state_var); + return atomic_read(&scx_enable_state_var); } -static enum scx_ops_enable_state -scx_ops_set_enable_state(enum scx_ops_enable_state to) +static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) { - return atomic_xchg(&scx_ops_enable_state_var, to); + return atomic_xchg(&scx_enable_state_var, to); } -static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, - enum scx_ops_enable_state from) +static bool scx_tryset_enable_state(enum scx_enable_state to, + enum scx_enable_state from) { int from_v = from; - return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); -} - -static bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); + return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); } /** @@ -1633,8 +761,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss) } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); } +static inline bool __cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + /** - * ops_cpu_valid - Verify a cpu number + * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error * @@ -1642,49 +776,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss) * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error. */ -static bool ops_cpu_valid(s32 cpu, const char *where) +static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) { - if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { + if (__cpu_valid(cpu)) { return true; } else { - scx_ops_error("invalid CPU %d%s%s", cpu, - where ? " " : "", where ?: ""); + scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); return false; } } /** * ops_sanitize_err - Sanitize a -errno value + * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure * @err: -errno value to sanitize * - * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * Verify @err is a valid -errno. If not, trigger scx_error() and return * -%EPROTO. This is necessary because returning a rogue -errno up the chain can * cause misbehaviors. For an example, a large negative return from * ops.init_task() triggers an oops when passed up the call chain because the * value fails IS_ERR() test after being encoded with ERR_PTR() and then is * handled as a pointer. 
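Usage of the scx_add_event()/__scx_add_event() macros above comes down to picking the variant that matches the caller's preemption state; a sketch, assuming @sch points at the current scx_sched:

        /* preemptible context */
        scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);

        /* preemption already disabled, e.g. under an rq lock */
        __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);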
*/ -static int ops_sanitize_err(const char *ops_name, s32 err) +static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) { if (err < 0 && err >= -MAX_ERRNO) return err; - scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); return -EPROTO; } static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); + + if (local_read(&rq->scx.reenq_local_deferred)) { + local_set(&rq->scx.reenq_local_deferred, 0); + reenq_local(rq); + } } -#ifdef CONFIG_SMP static void deferred_bal_cb_workfn(struct rq *rq) { run_deferred(rq); } -#endif static void deferred_irq_workfn(struct irq_work *irq_work) { @@ -1699,15 +836,30 @@ static void deferred_irq_workfn(struct irq_work *irq_work) * schedule_deferred - Schedule execution of deferred actions on an rq * @rq: target rq * - * Schedule execution of deferred actions on @rq. Must be called with @rq - * locked. Deferred actions are executed with @rq locked but unpinned, and thus - * can unlock @rq to e.g. migrate tasks to other rqs. + * Schedule execution of deferred actions on @rq. Deferred actions are executed + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks + * to other rqs. */ static void schedule_deferred(struct rq *rq) { + /* + * Queue an irq work. They are executed on IRQ re-enable which may take + * a bit longer than the scheduler hook in schedule_deferred_locked(). + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * schedule_deferred_locked - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Equivalent to + * schedule_deferred() but requires @rq to be locked and can be more efficient. + */ +static void schedule_deferred_locked(struct rq *rq) +{ lockdep_assert_rq_held(rq); -#ifdef CONFIG_SMP /* * If in the middle of waking up a task, task_woken_scx() will be called * afterwards which will then run the deferred actions, no need to @@ -1716,23 +868,32 @@ static void schedule_deferred(struct rq *rq) if (rq->scx.flags & SCX_RQ_IN_WAKEUP) return; + /* Don't do anything if there already is a deferred operation. */ + if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) + return; + /* * If in balance, the balance callbacks will be called before rq lock is * released. Schedule one. + * + * + * We can't directly insert the callback into the + * rq's list: The call can drop its lock and make the pending balance + * callback visible to unrelated code paths that call rq_pin_lock(). + * + * Just let balance_one() know that it must do it itself. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) { - queue_balance_callback(rq, &rq->scx.deferred_bal_cb, - deferred_bal_cb_workfn); + rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; return; } -#endif + /* - * No scheduler hooks available. Queue an irq work. They are executed on - * IRQ re-enable which may take a bit longer than the scheduler hooks. - * The above WAKEUP and BALANCE paths should cover most of the cases and - * the time to IRQ re-enable shouldn't be long. + * No scheduler hooks available. Use the generic irq_work path. The + * above WAKEUP and BALANCE paths should cover most of the cases and the + * time to IRQ re-enable shouldn't be long. 
*/ - irq_work_queue(&rq->scx.deferred_irq_work); + schedule_deferred(rq); } /** @@ -1777,7 +938,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) lockdep_assert_rq_held(rq); #ifdef CONFIG_SCHED_CORE - if (SCX_HAS_OP(core_sched_before)) + if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) touch_core_sched(rq, p); #endif } @@ -1815,8 +976,38 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) +{ + p->scx.slice = READ_ONCE(scx_slice_dfl); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); +} + +static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + /* + * If @rq is in balance, the CPU is already vacant and looking for the + * next task to run. No need to preempt or trigger resched after moving + * @p into its local DSQ. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) + resched_curr(rq); +} + +static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; @@ -1825,12 +1016,14 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, !RB_EMPTY_NODE(&p->scx.dsq_priq)); if (!is_local) { - raw_spin_lock(&dsq->lock); + raw_spin_lock_nested(&dsq->lock, + (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { - scx_ops_error("attempting to dispatch to a destroyed dsq"); + scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -1844,7 +1037,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * disallow any internal DSQ from doing vtime ordering of * tasks. 
*/ - scx_ops_error("cannot use vtime ordering for built-in DSQs"); + scx_error(sch, "cannot use vtime ordering for built-in DSQs"); enq_flags &= ~SCX_ENQ_DSQ_PRIQ; } @@ -1858,8 +1051,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, */ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && nldsq_next_task(dsq, NULL, false))) - scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); @@ -1874,19 +1067,31 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) - scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1912,22 +1117,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - if (is_local) { - struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - - if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && - rq->curr->sched_class == &ext_sched_class) { - rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, - rq->curr->sched_class)) - resched_curr(rq); - } else { + if (is_local) + local_dsq_post_enq(dsq, p, enq_flags); + else raw_spin_unlock(&dsq->lock); - } } static void task_unlink_from_dsq(struct task_struct *p, @@ -1943,6 +1136,13 @@ static void task_unlink_from_dsq(struct task_struct *p, list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); + + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } } static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -1950,6 +1150,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; + lockdep_assert_rq_held(rq); + if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. 
@@ -1996,7 +1198,22 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, +/* + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq + * and dsq are locked. + */ +static void dispatch_dequeue_locked(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_held(&dsq->lock); + + task_unlink_from_dsq(p, dsq); + p->scx.dsq = NULL; +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, + struct rq *rq, u64 dsq_id, struct task_struct *p) { struct scx_dispatch_q *dsq; @@ -2007,27 +1224,28 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", - dsq_id, p->comm, p->pid); - return find_global_dsq(p); + scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return find_global_dsq(sch, p); } return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -2041,12 +1259,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_ops_error("%s[%d] already direct-dispatched", - p->comm, p->pid); + scx_error(sch, "%s[%d] already direct-dispatched", + p->comm, p->pid); else - scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", - ddsp_task->comm, ddsp_task->pid, - p->comm, p->pid); + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); return; } @@ -2057,11 +1275,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, p->scx.ddsp_enq_flags = enq_flags; } -static void direct_dispatch(struct task_struct *p, u64 enq_flags) +static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); touch_core_sched_dispatch(rq, p); @@ -2098,11 +1317,12 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); list_add_tail(&p->scx.dsq_list.node, &rq->scx.ddsp_deferred_locals); - schedule_deferred(rq); + schedule_deferred_locked(rq); return; } - dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, + p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -2120,7 +1340,9 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { + struct scx_sched *sch = 
scx_root; struct task_struct **ddsp_taskp; + struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -2138,28 +1360,28 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, goto local; if (scx_rq_bypassing(rq)) { - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); - goto global; + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ - if (!static_branch_unlikely(&scx_ops_enq_exiting) && + if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && unlikely(p->flags & PF_EXITING)) { - __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); goto local; } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ - if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && is_migration_disabled(p)) { - __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; } - if (!SCX_HAS_OP(enqueue)) + if (unlikely(!SCX_HAS_OP(sch, enqueue))) goto global; /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ @@ -2172,7 +1394,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2186,27 +1408,30 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, return; direct: - direct_dispatch(p, enq_flags); + direct_dispatch(sch, p, enq_flags); + return; +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; - local: + dsq = &rq->scx.local_dsq; + goto enqueue; +global: + dsq = find_global_dsq(sch, p); + goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; + +enqueue: /* * For task-ordering, slice refill must be treated as implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the * higher priority it becomes from scx_prio_less()'s POV. */ touch_core_sched(rq, p); - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); -local_norefill: - dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); - return; - -global: - touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); - dispatch_enqueue(find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2224,7 +1449,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) } /* - * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * list_add_tail() must be used. scx_bypass() depends on tasks being * appended to the runnable_list. 
*/ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); @@ -2239,6 +1464,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { + struct scx_sched *sch = scx_root; int sticky_cpu = p->scx.sticky_cpu; if (enq_flags & ENQUEUE_WAKEUP) @@ -2268,8 +1494,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags rq->scx.nr_running++; add_nr_running(rq, 1); - if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -2280,11 +1506,12 @@ out: if ((enq_flags & SCX_ENQ_CPU_SELECTED) && unlikely(cpu_of(rq) != p->scx.selected_cpu)) - __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); + __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); } -static void ops_dequeue(struct task_struct *p, u64 deq_flags) +static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { + struct scx_sched *sch = scx_root; unsigned long opss; /* dequeue is always temporary, don't reset runnable_at */ @@ -2303,8 +1530,9 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, + p, deq_flags); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -2332,12 +1560,14 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) { + struct scx_sched *sch = scx_root; + if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); return true; } - ops_dequeue(p, deq_flags); + ops_dequeue(rq, p, deq_flags); /* * A currently running task which is going off @rq first gets dequeued @@ -2347,17 +1577,17 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags * * @p may go through multiple stopping <-> running transitions between * here and put_prev_task_scx() if task attribute changes occur while - * balance_scx() leaves @rq unlocked. However, they don't contain any + * balance_one() leaves @rq unlocked. However, they don't contain any * information meaningful to the BPF scheduler and can be suppressed by * skipping the callbacks if the task is !QUEUED. 
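Summarizing the callback pairing that enqueue_task_scx()/dequeue_task_scx() and the running/stopping calls below maintain (a sketch assembled from the comments in this hunk):

        /*
         *   ops.runnable(p)        @p becomes runnable on a CPU
         *     ops.running(p)       @p starts executing
         *     ops.stopping(p)      @p stops executing
         *   ops.quiescent(p)       @p is no longer runnable
         *
         * running/stopping may repeat several times inside one
         * runnable/quiescent pair; migrating tasks skip the
         * runnable/quiescent notifications since they stay runnable
         * across the migration.
         */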
*/ - if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); } - if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -2374,20 +1604,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { - struct task_struct *p = rq->curr; + struct scx_sched *sch = scx_root; + struct task_struct *p = rq->donor; - if (SCX_HAS_OP(yield)) - SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + if (SCX_HAS_OP(sch, yield)) + SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); else p->scx.slice = 0; } static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { - struct task_struct *from = rq->curr; + struct scx_sched *sch = scx_root; + struct task_struct *from = rq->donor; - if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + if (SCX_HAS_OP(sch, yield)) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, + from, to); else return false; } @@ -2411,9 +1644,10 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_mod_nr(dst_dsq, 1); p->scx.dsq = dst_dsq; + + local_dsq_post_enq(dst_dsq, p, enq_flags); } -#ifdef CONFIG_SMP /** * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ * @p: task to move @@ -2467,7 +1701,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * The caller must ensure that @p and @rq are on different CPUs. 
*/ -static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, +static bool task_can_run_on_remote_rq(struct scx_sched *sch, + struct task_struct *p, struct rq *rq, bool enforce) { int cpu = cpu_of(rq); @@ -2488,8 +1723,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (unlikely(is_migration_disabled(p))) { if (enforce) - scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", - p->comm, p->pid, task_cpu(p), cpu); + scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); return false; } @@ -2501,14 +1736,14 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (enforce) - scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", - cpu, p->comm, p->pid); + scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2578,14 +1813,10 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, return false; } } -#else /* CONFIG_SMP */ -static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } -static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } -#endif /* CONFIG_SMP */ /** * move_task_between_dsqs() - Move a task from one DSQ to another + * @sch: scx_sched being operated on * @p: target task * @enq_flags: %SCX_ENQ_* * @src_dsq: DSQ @p is currently on, must not be a local DSQ @@ -2599,7 +1830,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p * On return, @src_dsq is unlocked and only @p's new task_rq, which is the * return value, is locked. */ -static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, +static struct rq *move_task_between_dsqs(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct scx_dispatch_q *dst_dsq) { @@ -2612,8 +1844,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -2642,58 +1874,21 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue. */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; + dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); - dispatch_enqueue(dst_dsq, p, enq_flags); + dispatch_enqueue(sch, dst_dsq, p, enq_flags); } return dst_rq; } -/* - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly - * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. 
Inject artificial delays while the - * bypass mode is switching to guarantee timely completion. - */ -static void scx_ops_breather(struct rq *rq) -{ - u64 until; - - lockdep_assert_rq_held(rq); - - if (likely(!atomic_read(&scx_ops_breather_depth))) - return; - - raw_spin_rq_unlock(rq); - - until = ktime_get_ns() + NSEC_PER_MSEC; - - do { - int cnt = 1024; - while (atomic_read(&scx_ops_breather_depth) && --cnt) - cpu_relax(); - } while (atomic_read(&scx_ops_breather_depth) && - time_before64(ktime_get_ns(), until)); - - raw_spin_rq_lock(rq); -} - -static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) +static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_ops_bypass() - * dequeueing tasks from @dsq trying to put the system into the bypass - * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can - * live-lock the machine into soft lockups. Give a breather. - */ - scx_ops_breather(rq); - - /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -2706,6 +1901,17 @@ retry: nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); + /* + * This loop can lead to multiple lockup scenarios, e.g. the BPF + * scheduler can put an enormous number of affinitized tasks into + * a contended DSQ, or the outer retry loop can repeatedly race + * against scx_bypass() dequeueing tasks from @dsq trying to put + * the system into the bypass mode. This can easily live-lock the + * machine. If aborting, exit from all non-bypass DSQs. + */ + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + break; + if (rq == task_rq) { task_unlink_from_dsq(p, dsq); move_local_task_to_local_dsq(p, 0, dsq, rq); @@ -2713,7 +1919,7 @@ retry: return true; } - if (task_can_run_on_remote_rq(p, rq, false)) { + if (task_can_run_on_remote_rq(sch, p, rq, false)) { if (likely(consume_remote_task(rq, p, dsq, task_rq))) return true; goto retry; @@ -2724,15 +1930,16 @@ retry: return false; } -static bool consume_global_dsq(struct rq *rq) +static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) { int node = cpu_to_node(cpu_of(rq)); - return consume_dispatch_q(rq, global_dsqs[node]); + return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); } /** * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @sch: scx_sched being operated on * @rq: current rq which is locked * @dst_dsq: destination DSQ * @p: task to dispatch @@ -2745,14 +1952,13 @@ static bool consume_global_dsq(struct rq *rq) * The caller must have exclusive ownership of @p (e.g. through * %SCX_OPSS_DISPATCHING). */ -static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, +static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dst_dsq, struct task_struct *p, u64 enq_flags) { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); -#ifdef CONFIG_SMP struct rq *locked_rq = rq; -#endif /* * We're synchronized against dequeue through DISPATCHING. As @p can't @@ -2761,14 +1967,14 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * If dispatching to @rq that @p is already on, no lock dancing needed. 
*/ if (rq == src_rq && rq == dst_rq) { - dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dst_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } -#ifdef CONFIG_SMP if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(find_global_dsq(p), p, + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2806,7 +2012,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, */ if (src_rq == dst_rq) { p->scx.holding_cpu = -1; - dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + enq_flags); } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); @@ -2824,9 +2031,6 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } -#else /* CONFIG_SMP */ - BUG(); /* control can not reach here on UP */ -#endif /* CONFIG_SMP */ } /** @@ -2848,7 +2052,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * was valid in the first place. Make sure that the task is still owned by the * BPF scheduler and claim the ownership before dispatching. */ -static void finish_dispatch(struct rq *rq, struct task_struct *p, +static void finish_dispatch(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, unsigned long qseq_at_dispatch, u64 dsq_id, u64 enq_flags) { @@ -2901,15 +2106,15 @@ retry: BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); - dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); if (dsq->id == SCX_DSQ_LOCAL) - dispatch_to_local_dsq(rq, dsq, p, enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); else - dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); } -static void flush_dispatch_buf(struct rq *rq) +static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); u32 u; @@ -2917,7 +2122,7 @@ static void flush_dispatch_buf(struct rq *rq) for (u = 0; u < dspc->cursor; u++) { struct scx_dsp_buf_ent *ent = &dspc->buf[u]; - finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, + finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, ent->enq_flags); } @@ -2925,8 +2130,22 @@ static void flush_dispatch_buf(struct rq *rq) dspc->cursor = 0; } +static inline void maybe_queue_balance_callback(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) + return; + + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, + deferred_bal_cb_workfn); + + rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; +} + static int balance_one(struct rq *rq, struct task_struct *prev) { + struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; @@ -2934,9 +2153,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; - if (static_branch_unlikely(&scx_ops_cpu_preempt) && + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { /* * If the previous sched_class for the 
current CPU was not SCX, @@ -2944,8 +2163,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * core. This callback complements ->cpu_release(), which is * emitted in switch_class(). */ - if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, + cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2959,8 +2179,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * scheduler wants to handle this explicitly, it should * implement ->cpu_release(). * - * See scx_ops_disable_workfn() for the explanation on the - * bypassing test. + * See scx_disable_workfn() for the explanation on the bypassing + * test. */ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -2972,10 +2192,17 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; - if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2990,10 +2217,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), - prev_on_scx ? prev : NULL); + SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, + cpu_of(rq), prev_on_scx ? prev : NULL); - flush_dispatch_buf(rq); + flush_dispatch_buf(sch, rq); if (prev_on_rq && prev->scx.slice) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -3001,7 +2228,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) } if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; /* @@ -3011,10 +2238,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -3024,10 +2251,10 @@ no_tasks: * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. */ - if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || - scx_rq_bypassing(rq))) { + if (prev_on_rq && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; - __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3038,40 +2265,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. 
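balance_one() above keeps invoking ops.dispatch() until the local DSQ is populated or the BPF scheduler has nothing to offer. On the BPF side, the minimal counterpart, modeled on the example schedulers (SHARED_DSQ is an assumed user-created DSQ id and BPF_STRUCT_OPS comes from the SCX tooling headers), is simply:

        void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
        {
                /* move one task from the shared DSQ into this CPU's local DSQ */
                scx_bpf_dsq_move_to_local(SHARED_DSQ);
        }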
- */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -3087,32 +2280,36 @@ static void process_ddsp_deferred_locals(struct rq *rq) */ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq; list_del_init(&p->scx.dsq_list.node); - dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, + p->scx.ddsp_enq_flags); } } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { + struct scx_sched *sch = scx_root; + if (p->scx.flags & SCX_TASK_QUEUED) { /* * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler. */ - ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } p->se.exec_start = rq_clock_task(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); clr_task_runnable(p, true); @@ -3142,10 +2339,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { -#ifdef CONFIG_SMP if (class == &stop_sched_class) return SCX_CPU_PREEMPT_STOP; -#endif if (class == &dl_sched_class) return SCX_CPU_PREEMPT_DL; if (class == &rt_sched_class) @@ -3155,17 +2350,10 @@ preempt_reason_from_class(const struct sched_class *class) static void switch_class(struct rq *rq, struct task_struct *next) { + struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; -#ifdef CONFIG_SMP - /* - * Pairs with the smp_load_acquire() issued by a CPU in - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a - * resched. - */ - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -#endif - if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; /* @@ -3184,17 +2372,17 @@ static void switch_class(struct rq *rq, struct task_struct *next) * preempted, and it regaining control of the CPU. * * ->cpu_release() complements ->cpu_acquire(), which is emitted the - * next time that balance_scx() is invoked. + * next time that balance_one() is invoked. 
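The ->cpu_release() notification discussed above is commonly answered by pushing whatever was queued on the local DSQ back to the BPF scheduler; a sketch modeled on the example schedulers (the op name is hypothetical, scx_bpf_reenqueue_local() is the kfunc intended for this context):

        void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
                            struct scx_cpu_release_args *args)
        {
                /* a higher priority sched class took the CPU; requeue our tasks */
                scx_bpf_reenqueue_local();
        }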
*/ if (!rq->scx.cpu_released) { - if (SCX_HAS_OP(cpu_release)) { + if (SCX_HAS_OP(sch, cpu_release)) { struct scx_cpu_release_args args = { .reason = preempt_reason_from_class(next_class), .task = next, }; - SCX_CALL_OP(SCX_KF_CPU_RELEASE, - cpu_release, cpu_of(rq), &args); + SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, + cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -3203,11 +2391,16 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct scx_sched *sch = scx_root; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -3219,7 +2412,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * DSQ. */ if (p->scx.slice && !scx_rq_bypassing(rq)) { - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + dispatch_enqueue(sch, &rq->scx.local_dsq, p, + SCX_ENQ_HEAD); goto switch_class; } @@ -3229,8 +2423,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * ops.enqueue() that @p is the only one available for this cpu, * which should trigger an explicit follow-up scheduling event. */ - if (sched_class_above(&ext_sched_class, next->sched_class)) { - WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); + if (next && sched_class_above(&ext_sched_class, next->sched_class)) { + WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { do_enqueue_task(rq, p, 0, -1); @@ -3248,78 +2442,75 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct * +do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; + bool keep_prev; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + + rq_modified_clear(rq); + + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). + * If any higher-priority sched class enqueued a runnable task on + * this rq during balance_one(), abort and return RETRY_TASK, so + * that the scheduler loop can restart. * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). + * If @force_scx is true, always try to pick a SCHED_EXT task, + * regardless of any higher-priority sched classes activity. 
*/ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ - WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } /* - * If balance_scx() is telling us to keep running @prev, replenish slice + * If balance_one() is telling us to keep running @prev, replenish slice * if necessary and keep running @prev. Otherwise, pop the first one * from the local DSQ. */ if (keep_prev) { p = prev; - if (!p->scx.slice) { - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); - } + if (!p->scx.slice) + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); - if (!p) { - if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + if (!p) return NULL; - } if (unlikely(!p->scx.slice)) { - if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { + struct scx_sched *sch = rcu_dereference_sched(scx_root); + + if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); - scx_warned_zero_slice = true; + sch->warned_zero_slice = true; } - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + refill_task_slice_dfl(sch, p); } } return p; } +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +{ + return do_pick_task_scx(rq, rf, false); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -3342,13 +2533,17 @@ static struct task_struct *pick_task_scx(struct rq *rq) bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { + struct scx_sched *sch = scx_root; + /* * The const qualifiers are dropped from task_struct pointers when * calling ops.core_sched_before(). Accesses are controlled by the * verifier. 
*/ - if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + if (SCX_HAS_OP(sch, core_sched_before) && + !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + NULL, (struct task_struct *)a, (struct task_struct *)b); else @@ -3356,10 +2551,9 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, } #endif /* CONFIG_SCHED_CORE */ -#ifdef CONFIG_SMP - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + struct scx_sched *sch = scx_root; bool rq_bypass; /* @@ -3376,7 +2570,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag return prev_cpu; rq_bypass = scx_rq_bypassing(task_rq(p)); - if (SCX_HAS_OP(select_cpu) && !rq_bypass) { + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3384,29 +2578,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, p, prev_cpu, wake_flags); + cpu = SCX_CALL_OP_TASK_RET(sch, + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, NULL, p, prev_cpu, + wake_flags); p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; - if (ops_cpu_valid(cpu, "from ops.select_cpu()")) + if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - p->scx.slice = SCX_SLICE_DFL; + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } else { cpu = prev_cpu; } p->scx.selected_cpu = cpu; if (rq_bypass) - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3419,6 +2614,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { + struct scx_sched *sch = scx_root; + set_cpus_allowed_common(p, ac); /* @@ -3429,28 +2626,38 @@ static void set_cpus_allowed_scx(struct task_struct *p, * Fine-grained memory write control is enforced by BPF making the const * designation pointless. Cast it away when calling the operation. */ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) { + struct scx_sched *sch = scx_root; int cpu = cpu_of(rq); atomic_long_inc(&scx_hotplug_seq); + /* + * scx_root updates are protected by cpus_read_lock() and will stay + * stable here. Note that we can't depend on scx_enabled() test as the + * hotplug ops need to be enabled before __scx_enabled is set. 
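select_task_rq_scx() above either consults ops.select_cpu() or falls back to scx_select_cpu_dfl() with a default slice refill. A matching BPF-side op, modeled on the example schedulers (names are illustrative; scx_bpf_select_cpu_dfl() and scx_bpf_dsq_insert() are the assumed kfuncs), could look like:

        s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
                           s32 prev_cpu, u64 wake_flags)
        {
                bool is_idle = false;
                s32 cpu;

                cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
                if (is_idle)
                        /* idle CPU found: dispatch straight to its local DSQ */
                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

                return cpu;
        }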
+ */ + if (unlikely(!sch)) + return; + if (scx_enabled()) - scx_idle_update_selcpu_topology(&scx_ops); + scx_idle_update_selcpu_topology(&sch->ops); - if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); - else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + if (online && SCX_HAS_OP(sch, cpu_online)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + else if (!online && SCX_HAS_OP(sch, cpu_offline)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); else - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "cpu %d going %s, exiting scheduler", cpu, - online ? "online" : "offline"); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); } void scx_rq_activate(struct rq *rq) @@ -3473,15 +2680,19 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) { + struct scx_sched *sch; struct task_struct *p; struct rq_flags rf; bool timed_out = false; rq_lock_irqsave(rq, &rf); + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + goto out_unlock; + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { unsigned long last_runnable = p->scx.runnable_at; @@ -3489,16 +2700,15 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + scx_watchdog_timeout))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "%s[%d] failed to run for %u.%03us", - p->comm, p->pid, - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); timed_out = true; break; } } +out_unlock: rq_unlock_irqrestore(rq, &rf); - return timed_out; } @@ -3520,19 +2730,24 @@ static void scx_watchdog_workfn(struct work_struct *work) void scx_tick(struct rq *rq) { + struct scx_sched *sch; unsigned long last_check; if (!scx_enabled()) return; + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + return; + last_check = READ_ONCE(scx_watchdog_timestamp); if (unlikely(time_after(jiffies, last_check + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "watchdog failed to check in for %u.%03us", - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); } update_other_load_avgs(rq); @@ -3540,6 +2755,8 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { + struct scx_sched *sch = scx_root; + update_curr_scx(rq); /* @@ -3549,8 +2766,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) if (scx_rq_bypassing(rq)) { curr->scx.slice = 0; touch_core_sched(rq, curr); - } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + } else if (SCX_HAS_OP(sch, tick)) { + SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); } if (!curr->scx.slice) @@ -3615,21 +2832,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) p->scx.flags |= state << SCX_TASK_STATE_SHIFT; } -static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) { + struct 
scx_sched *sch = scx_root; int ret; p->scx.disallow = false; - if (SCX_HAS_OP(init_task)) { + if (SCX_HAS_OP(sch, init_task)) { struct scx_init_task_args args = { SCX_INIT_TASK_ARGS_CGROUP(tg) .fork = fork, }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, + p, &args); if (unlikely(ret)) { - ret = ops_sanitize_err("init_task", ret); + ret = ops_sanitize_err(sch, "init_task", ret); return ret; } } @@ -3657,8 +2876,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool task_rq_unlock(rq, p, &rf); } else if (p->policy == SCHED_EXT) { - scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", - p->comm, p->pid); + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); } } @@ -3666,11 +2885,13 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool return 0; } -static void scx_ops_enable_task(struct task_struct *p) +static void scx_enable_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); u32 weight; - lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_rq_held(rq); /* * Set the weight before calling ops.enable() so that the scheduler @@ -3683,26 +2904,31 @@ static void scx_ops_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); - if (SCX_HAS_OP(enable)) - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + if (SCX_HAS_OP(sch, enable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); scx_set_task_state(p, SCX_TASK_ENABLED); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } -static void scx_ops_disable_task(struct task_struct *p) +static void scx_disable_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); - if (SCX_HAS_OP(disable)) - SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + if (SCX_HAS_OP(sch, disable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); } -static void scx_ops_exit_task(struct task_struct *p) +static void scx_exit_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; struct scx_exit_task_args args = { .cancelled = false, }; @@ -3718,15 +2944,16 @@ static void scx_ops_exit_task(struct task_struct *p) case SCX_TASK_READY: break; case SCX_TASK_ENABLED: - scx_ops_disable_task(p); + scx_disable_task(p); break; default: WARN_ON_ONCE(true); return; } - if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), + p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3740,7 +2967,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -3758,15 +2985,15 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_ops_init_task_enabled) - return scx_ops_init_task(p, task_group(p), true); + if (scx_init_task_enabled) + return scx_init_task(p, task_group(p), true); else return 0; } 
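For orientation, the helpers above and the fork hooks below drive the per-task state machine roughly as follows; SCX_TASK_INIT is the intermediate state set by scx_init_task() and is only assumed here since it does not appear in this hunk:

        /*
         *   SCX_TASK_NONE            -- scx_init_task()    -->  SCX_TASK_INIT
         *   SCX_TASK_INIT            -- scx_post_fork()    -->  SCX_TASK_READY
         *   SCX_TASK_READY           -- scx_enable_task()  -->  SCX_TASK_ENABLED
         *   SCX_TASK_ENABLED         -- scx_disable_task() -->  SCX_TASK_READY
         *   SCX_TASK_READY/ENABLED   -- scx_exit_task()    -->  SCX_TASK_NONE
         */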
void scx_post_fork(struct task_struct *p) { - if (scx_ops_init_task_enabled) { + if (scx_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -3779,14 +3006,14 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_enable_task(p); + scx_enable_task(p); task_rq_unlock(rq, p, &rf); } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -3799,31 +3026,31 @@ void scx_cancel_fork(struct task_struct *p) rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> - * ENABLED transitions can't race us. Disable ops for @p. + * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED + * transitions can't race us. Disable ops for @p. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } } @@ -3831,33 +3058,38 @@ void sched_ext_free(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { + struct scx_sched *sch = scx_root; + lockdep_assert_rq_held(task_rq(p)); p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } static void switching_to_scx(struct rq *rq, struct task_struct *p) { - scx_ops_enable_task(p); + struct scx_sched *sch = scx_root; + + scx_enable_task(p); /* * set_cpus_allowed_scx() is not called while @p is associated with a * different scheduler class. Keep the BPF scheduler up-to-date. 
*/ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) { - scx_ops_disable_task(p); + scx_disable_task(p); } static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} @@ -3897,59 +3129,66 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; +void scx_tg_init(struct task_group *tg) +{ + tg->scx.weight = CGROUP_WEIGHT_DFL; + tg->scx.bw_period_us = default_bw_period_us(); + tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; +} + int scx_tg_online(struct task_group *tg) { + struct scx_sched *sch = scx_root; int ret = 0; - WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - - percpu_down_read(&scx_cgroup_rwsem); + WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); if (scx_cgroup_enabled) { - if (SCX_HAS_OP(cgroup_init)) { + if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx_weight }; + { .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + NULL, tg->css.cgroup, &args); if (ret) - ret = ops_sanitize_err("cgroup_init", ret); + ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) - tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; } else { - tg->scx_flags |= SCX_TG_ONLINE; + tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } void scx_tg_offline(struct task_group *tg) { - WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); - - percpu_down_read(&scx_cgroup_rwsem); + struct scx_sched *sch = scx_root; - if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); - tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_up_read(&scx_cgroup_rwsem); + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && + (tg->scx.flags & SCX_TG_INITED)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + tg->css.cgroup); + tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -3967,8 +3206,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) if (from == to) continue; - if (SCX_HAS_OP(cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + if (SCX_HAS_OP(sch, cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, + cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -3981,18 +3221,20 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) err: cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, 
css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); - return ops_sanitize_err("cgroup_prep_move", ret); + return ops_sanitize_err(sch, "cgroup_prep_move", ret); } void scx_cgroup_move_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + if (!scx_cgroup_enabled) return; @@ -4000,68 +3242,101 @@ void scx_cgroup_move_task(struct task_struct *p) * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ - if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, - p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + if (SCX_HAS_OP(sch, cgroup_move) && + !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, + tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - percpu_down_read(&scx_cgroup_rwsem); + struct scx_sched *sch = scx_root; - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && + tg->scx.weight != weight) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); - percpu_up_read(&scx_cgroup_rwsem); + tg->scx.weight = weight; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); +} + +void scx_group_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && + (tg->scx.bw_period_us != period_us || + tg->scx.bw_quota_us != quota_us || + tg->scx.bw_burst_us != burst_us)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + tg_cgrp(tg), period_us, quota_us, burst_us); + + tg->scx.bw_period_us = period_us; + tg->scx.bw_quota_us = quota_us; + tg->scx.bw_burst_us = burst_us; + + 
percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -4078,6 +3353,8 @@ static inline void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -4085,20 +3362,17 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, .set_next_task = set_next_task_scx, -#ifdef CONFIG_SMP .select_task_rq = select_task_rq_scx, .task_woken = task_woken_scx, .set_cpus_allowed = set_cpus_allowed_scx, .rq_online = rq_online_scx, .rq_offline = rq_offline_scx, -#endif .task_tick = task_tick_scx, @@ -4124,29 +3398,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) dsq->id = dsq_id; } -static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -{ - struct scx_dispatch_q *dsq; - int ret; - - if (dsq_id & SCX_DSQ_FLAG_BUILTIN) - return ERR_PTR(-EINVAL); - - dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) - return ERR_PTR(-ENOMEM); - - init_dsq(dsq, dsq_id); - - ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); - if (ret) { - kfree(dsq); - return ERR_PTR(ret); - } - return dsq; -} - static void free_dsq_irq_workfn(struct irq_work *irq_work) { struct llist_node *to_free = llist_del_all(&dsqs_to_free); @@ -4158,26 +3409,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -static void destroy_dsq(u64 dsq_id) +static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) { struct scx_dispatch_q *dsq; unsigned long flags; rcu_read_lock(); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (!dsq) goto out_unlock_rcu; raw_spin_lock_irqsave(&dsq->lock, flags); if (dsq->nr) { - scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", - dsq->id, dsq->nr); + scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); goto out_unlock_dsq; } - if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params)) goto out_unlock_dsq; /* @@ -4197,83 +3449,67 @@ out_unlock_rcu: } #ifdef CONFIG_EXT_GROUP_SCHED -static void scx_cgroup_exit(void) +static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. 
*/ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - if (!(tg->scx_flags & SCX_TG_INITED)) + if (!(tg->scx.flags & SCX_TG_INITED)) continue; - tg->scx_flags &= ~SCX_TG_INITED; + tg->scx.flags &= ~SCX_TG_INITED; - if (!scx_ops.cgroup_exit) + if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); - - rcu_read_lock(); - css_put(css); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + css->cgroup); } - rcu_read_unlock(); } -static int scx_cgroup_init(void) +static int scx_cgroup_init(struct scx_sched *sch) { struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. */ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + struct scx_cgroup_init_args args = { + .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us, + }; - if ((tg->scx_flags & + if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; - if (!scx_ops.cgroup_init) { - tg->scx_flags |= SCX_TG_INITED; + if (!sch->ops.cgroup_init) { + tg->scx.flags |= SCX_TG_INITED; continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { css_put(css); - scx_ops_error("ops.cgroup_init() failed (%d)", ret); + scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } - tg->scx_flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); + tg->scx.flags |= SCX_TG_INITED; } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; @@ -4282,8 +3518,8 @@ static int scx_cgroup_init(void) } #else -static void scx_cgroup_exit(void) {} -static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_exit(struct scx_sched *sch) {} +static int scx_cgroup_init(struct scx_sched *sch) { return 0; } #endif @@ -4300,8 +3536,7 @@ static int scx_cgroup_init(void) { return 0; } static ssize_t scx_attr_state_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", - scx_ops_enable_state_str[scx_ops_enable_state()]); + return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); } SCX_ATTR(state); @@ -4346,15 +3581,53 @@ static const struct attribute_group scx_global_attr_group = { .attrs = scx_global_attrs, }; +static void free_exit_info(struct scx_exit_info *ei); + +static void scx_sched_free_rcu_work(struct work_struct *work) +{ + struct rcu_work *rcu_work = to_rcu_work(work); + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int node; + + irq_work_sync(&sch->error_irq_work); + kthread_destroy_worker(sch->helper); + + free_percpu(sch->pcpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); + + rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = 
rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(sch, dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); + free_exit_info(sch->exit_info); + kfree(sch); +} + static void scx_kobj_release(struct kobject *kobj) { - kfree(kobj); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); + queue_rcu_work(system_unbound_wq, &sch->rcu_work); } static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_ops.name); + return sysfs_emit(buf, "%s\n", scx_root->ops.name); } SCX_ATTR(ops); @@ -4365,16 +3638,17 @@ SCX_ATTR(ops); static ssize_t scx_attr_events_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); struct scx_event_stats events; int at = 0; - scx_bpf_events(&events, sizeof(events)); + scx_read_events(sch, &events); at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); @@ -4397,7 +3671,7 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); + return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { @@ -4410,14 +3684,85 @@ static const struct kset_uevent_ops scx_uevent_ops = { */ bool task_should_scx(int policy) { - if (!scx_enabled() || - unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; return policy == SCHED_EXT; } +bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; +} + +/** + * handle_lockup - sched_ext common lockup handler + * @fmt: format string + * + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. + */ +static __printf(1, 2) bool handle_lockup(const char *fmt, ...) 
+{
+	struct scx_sched *sch;
+	va_list args;
+	bool ret;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return false;
+
+	switch (scx_enable_state()) {
+	case SCX_ENABLING:
+	case SCX_ENABLED:
+		va_start(args, fmt);
+		ret = scx_verror(sch, fmt, args);
+		va_end(args);
+		return ret;
+	default:
+		return false;
+	}
+}
+
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+	return handle_lockup("RCU CPU stall detected!");
+}
+
 /**
  * scx_softlockup - sched_ext softlockup handler
  * @dur_s: number of seconds of CPU stuck due to soft lockup
@@ -4426,44 +3771,237 @@ bool task_should_scx(int policy)
  * live-lock the system by making many CPUs target the same DSQ to the point
  * where soft-lockup detection triggers. This function is called from
  * soft-lockup watchdog when the triggering point is close and tries to unjam
- * the system by enabling the breather and aborting the BPF scheduler.
+ * the system by aborting the BPF scheduler.
  */
 void scx_softlockup(u32 dur_s)
 {
-	switch (scx_ops_enable_state()) {
-	case SCX_OPS_ENABLING:
-	case SCX_OPS_ENABLED:
-		break;
-	default:
+	if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
 		return;
+
+	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
+			smp_processor_id(), dur_s);
+}
+
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hard lockup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
+ */
+bool scx_hardlockup(int cpu)
+{
+	if (!handle_lockup("hard lockup - CPU %d", cpu))
+		return false;
+
+	printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+			cpu);
+	return true;
+}
+
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+			 struct cpumask *donee_mask, struct cpumask *resched_mask,
+			 u32 nr_donor_target, u32 nr_donee_target)
+{
+	struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+	struct task_struct *p, *n;
+	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+	u32 nr_balanced = 0, min_delta_us;
+
+	/*
+	 * All we want to guarantee is reasonable forward progress. No reason to
+	 * fine tune. Assuming every task on @donor_dsq runs their full slice,
+	 * consider offloading iff the total queued duration is over the
+	 * threshold.
+ */ + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + return 0; + + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + list_add(&cursor.node, &donor_dsq->list); +resume: + n = container_of(&cursor, struct task_struct, scx.dsq_list); + n = nldsq_next_task(donor_dsq, n, false); + + while ((p = n)) { + struct rq *donee_rq; + struct scx_dispatch_q *donee_dsq; + int donee; + + n = nldsq_next_task(donor_dsq, n, false); + + if (donor_dsq->nr <= nr_donor_target) + break; + + if (cpumask_empty(donee_mask)) + break; + + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); + if (donee >= nr_cpu_ids) + continue; + + donee_rq = cpu_rq(donee); + donee_dsq = &donee_rq->scx.bypass_dsq; + + /* + * $p's rq is not locked but $p's DSQ lock protects its + * scheduling properties making this test safe. + */ + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + continue; + + /* + * Moving $p from one non-local DSQ to another. The source rq + * and DSQ are already locked. Do an abbreviated dequeue and + * then perform enqueue without unlocking $donor_dsq. + * + * We don't want to drop and reacquire the lock on each + * iteration as @donor_dsq can be very long and potentially + * highly contended. Donee DSQs are less likely to be contended. + * The nested locking is safe as only this LB moves tasks + * between bypass DSQs. + */ + dispatch_dequeue_locked(p, donor_dsq); + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + + /* + * $donee might have been idle and need to be woken up. No need + * to be clever. Kick every CPU that receives tasks. + */ + cpumask_set_cpu(donee, resched_mask); + + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) + cpumask_clear_cpu(donee, donee_mask); + + nr_balanced++; + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { + list_move_tail(&cursor.node, &n->scx.dsq_list.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + cpu_relax(); + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + goto resume; + } } - /* allow only one instance, cleared at the end of scx_ops_bypass() */ - if (test_and_set_bit(0, &scx_in_softlockup)) - return; + list_del_init(&cursor.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + + return nr_balanced; +} - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_ops.name); +static void bypass_lb_node(struct scx_sched *sch, int node) +{ + const struct cpumask *node_mask = cpumask_of_node(node); + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; + u32 nr_target, nr_donor_target; + u32 before_min = U32_MAX, before_max = 0; + u32 after_min = U32_MAX, after_max = 0; + int cpu; + + /* count the target tasks and CPUs */ + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + nr_tasks += nr; + nr_cpus++; + + before_min = min(nr, before_min); + before_max = max(nr, before_max); + } + + if (!nr_cpus) + return; /* - * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to - * scx_ops_bypass(). + * We don't want CPUs to have more than $nr_donor_target tasks and + * balancing to fill donee CPUs upto $nr_target. Once targets are + * calculated, find the donee CPUs. 
*/ - atomic_inc(&scx_ops_breather_depth); + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); + + cpumask_clear(donee_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + cpumask_set_cpu(cpu, donee_mask); + } + + /* iterate !donee CPUs and see if they should be offloaded */ + cpumask_clear(resched_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + struct rq *rq = cpu_rq(cpu); + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + + if (cpumask_empty(donee_mask)) + break; + if (cpumask_test_cpu(cpu, donee_mask)) + continue; + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + continue; + + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_donor_target, nr_target); + } + + for_each_cpu(cpu, resched_mask) + resched_cpu(cpu); - scx_ops_error("soft lockup - CPU#%d stuck for %us", - smp_processor_id(), dur_s); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + after_min = min(nr, after_min); + after_max = max(nr, after_max); + + } + + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before_max, after_min, after_max); } -static void scx_clear_softlockup(void) +/* + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some + * bypass DSQs can be overloaded. If there are enough tasks to saturate other + * lightly loaded CPUs, such imbalance can lead to very high execution latency + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such + * outcomes, a simple load balancing mechanism is implemented by the following + * timer which runs periodically while bypass mode is in effect. + */ +static void scx_bypass_lb_timerfn(struct timer_list *timer) { - if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_ops_breather_depth); + struct scx_sched *sch; + int node; + u32 intv_us; + + sch = rcu_dereference_all(scx_root); + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + return; + + for_each_node_with_cpus(node) + bypass_lb_node(sch, node); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us) + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } +static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); + /** - * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4482,44 +4020,57 @@ static void scx_clear_softlockup(void) * * - ops.dispatch() is ignored. * - * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * can't be trusted. Whenever a tick triggers, the running task is rotated to * the tail of the queue with core_sched_at touched. * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. 
*/ -static void scx_ops_bypass(bool bypass) +static void scx_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); static unsigned long bypass_timestamp; - - int cpu; + struct scx_sched *sch; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&bypass_lock, flags); + sch = rcu_dereference_bh(scx_root); + if (bypass) { - scx_ops_bypass_depth++; - WARN_ON_ONCE(scx_ops_bypass_depth <= 0); - if (scx_ops_bypass_depth != 1) + u32 intv_us; + + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); + WARN_ON_ONCE(scx_bypass_depth <= 0); + if (scx_bypass_depth != 1) goto unlock; + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); - scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { + scx_bypass_lb_timer.expires = + jiffies + usecs_to_jiffies(intv_us); + add_timer_global(&scx_bypass_lb_timer); + } } else { - scx_ops_bypass_depth--; - WARN_ON_ONCE(scx_ops_bypass_depth < 0); - if (scx_ops_bypass_depth != 0) + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); + WARN_ON_ONCE(scx_bypass_depth < 0); + if (scx_bypass_depth != 0) goto unlock; - scx_add_event(SCX_EV_BYPASS_DURATION, - ktime_get_ns() - bypass_timestamp); + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_ops_breather_depth); - /* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -4562,11 +4113,10 @@ static void scx_ops_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -4576,10 +4126,8 @@ static void scx_ops_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_ops_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); - scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -4632,42 +4180,51 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } -static void scx_ops_disable_workfn(struct kthread_work *work) +static void free_kick_syncs(void) { - struct scx_exit_info *ei = scx_exit_info; + int cpu; + + for_each_possible_cpu(cpu) { + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *to_free; + + to_free = rcu_replace_pointer(*ksyncs, NULL, true); + if (to_free) + kvfree_rcu(to_free, rcu); + } +} + +static void scx_disable_workfn(struct kthread_work *work) +{ + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; - struct rhashtable_iter rht_iter; - struct scx_dispatch_q *dsq; - int i, kind, cpu; + int kind, cpu; - kind = atomic_read(&scx_exit_kind); + kind = atomic_read(&sch->exit_kind); while (true) { - /* - * NONE indicates that a new scx_ops has been registered since - * disable was scheduled - don't kill the new ops. DONE - * indicates that the ops has already been disabled. 
- */ - if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + if (kind == SCX_EXIT_DONE) /* already disabled? */ return; - if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) break; } ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); /* guarantee forward progress by bypassing scx_ops */ - scx_ops_bypass(true); + scx_bypass(true); + WRITE_ONCE(scx_aborting, false); - switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { - case SCX_OPS_DISABLING: + switch (scx_set_enable_state(SCX_DISABLING)) { + case SCX_DISABLING: WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); break; - case SCX_OPS_DISABLED: + case SCX_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", - scx_exit_info->msg); - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + sch->exit_info->msg); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); goto done; default: break; @@ -4678,17 +4235,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work) * we can safely use blocking synchronization constructs. Actually * disable ops. */ - mutex_lock(&scx_ops_enable_mutex); + mutex_lock(&scx_enable_mutex); static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); /* * Shut down cgroup support before tasks so that the cgroup attach path - * doesn't race against scx_ops_exit_task(). + * doesn't race against scx_exit_task(). */ scx_cgroup_lock(); - scx_cgroup_exit(); + scx_cgroup_exit(sch); scx_cgroup_unlock(); /* @@ -4697,27 +4254,24 @@ static void scx_ops_disable_workfn(struct kthread_work *work) */ percpu_down_write(&scx_fork_rwsem); - scx_ops_init_task_enabled = false; + scx_init_task_enabled = false; scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + const struct sched_class *new_class = scx_setscheduler_class(p); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + update_rq_clock(task_rq(p)); - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); - scx_ops_exit_task(p); + scx_exit_task(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -4732,98 +4286,92 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable(&__scx_ops_enabled); - for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable(&scx_has_op[i]); - static_branch_disable(&scx_ops_allow_queued_wakeup); - static_branch_disable(&scx_ops_enq_last); - static_branch_disable(&scx_ops_enq_exiting); - static_branch_disable(&scx_ops_enq_migration_disabled); - static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&__scx_enabled); + bitmap_zero(sch->has_op, SCX_OPI_END); scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { 
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); if (ei->msg[0] != '\0') - pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); #ifdef CONFIG_STACKTRACE stack_trace_print(ei->bt, ei->bt_len, 2); #endif } else { pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); } - if (scx_ops.exit) - SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); cancel_delayed_work_sync(&scx_watchdog_work); /* - * Delete the kobject from the hierarchy eagerly in addition to just - * dropping a reference. Otherwise, if the object is deleted - * asynchronously, sysfs could observe an object of the same name still - * in the hierarchy when another scheduler is loaded. + * scx_root clearing must be inside cpus_read_lock(). See + * handle_hotplug(). */ - kobject_del(scx_root_kobj); - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - - memset(&scx_ops, 0, sizeof(scx_ops)); - - rhashtable_walk_enter(&dsq_hash, &rht_iter); - do { - rhashtable_walk_start(&rht_iter); - - while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) - destroy_dsq(dsq->id); + cpus_read_lock(); + RCU_INIT_POINTER(scx_root, NULL); + cpus_read_unlock(); - rhashtable_walk_stop(&rht_iter); - } while (dsq == ERR_PTR(-EAGAIN)); - rhashtable_walk_exit(&rht_iter); + /* + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs + * could observe an object of the same name still in the hierarchy when + * the next scheduler is loaded. + */ + kobject_del(&sch->kobj); free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_syncs(); - free_exit_info(scx_exit_info); - scx_exit_info = NULL; + if (scx_bypassed_for_enable) { + scx_bypassed_for_enable = false; + scx_bypass(false); + } - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); done: - scx_ops_bypass(false); + scx_bypass(false); } -static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); - -static void schedule_scx_ops_disable_work(void) +static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { - struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + int none = SCX_EXIT_NONE; + + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) + return false; /* - * We may be called spuriously before the first bpf_sched_ext_reg(). If - * scx_ops_helper isn't set up yet, there's nothing to do. + * Some CPUs may be trapped in the dispatch paths. Set the aborting + * flag to break potential live-lock scenarios, ensuring we can + * successfully reach scx_bypass(). 
*/ - if (helper) - kthread_queue_work(helper, &scx_ops_disable_work); + WRITE_ONCE(scx_aborting, true); + return true; } -static void scx_ops_disable(enum scx_exit_kind kind) +static void scx_disable(enum scx_exit_kind kind) { - int none = SCX_EXIT_NONE; + struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) kind = SCX_EXIT_ERROR; - atomic_try_cmpxchg(&scx_exit_kind, &none, kind); - - schedule_scx_ops_disable_work(); + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + scx_claim_exit(sch, kind); + kthread_queue_work(sch->helper, &sch->disable_work); + } + rcu_read_unlock(); } static void dump_newline(struct seq_buf *s) @@ -4941,6 +4489,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; + struct scx_sched *sch = scx_root; char dsq_id_buf[19] = "(n/a)"; unsigned long ops_state = atomic_long_read(&p->scx.ops_state); unsigned int bt_len = 0; @@ -4961,11 +4510,12 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); - if (SCX_HAS_OP(dump_task)) { + if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -4982,6 +4532,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) { static DEFINE_SPINLOCK(dump_lock); static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_sched *sch = scx_root; struct scx_dump_ctx dctx = { .kind = ei->kind, .exit_code = ei->exit_code, @@ -5010,9 +4561,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_stack_trace(&s, " ", ei->bt, ei->bt_len); } - if (SCX_HAS_OP(dump)) { + if (SCX_HAS_OP(sch, dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); ops_dump_exit(); } @@ -5028,12 +4579,12 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; - if (idle && !SCX_HAS_OP(dump_cpu)) + if (idle && !SCX_HAS_OP(sch, dump_cpu)) goto next; /* @@ -5046,10 +4597,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", cpu, rq->scx.nr_running, rq->scx.flags, rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.pnt_seq); + rq->scx.kick_sync); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -5067,9 +4618,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) cpumask_pr_args(rq->scx.cpus_to_wait)); used = seq_buf_used(&ns); - if (SCX_HAS_OP(dump_cpu)) { + if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + &dctx, cpu, idle); ops_dump_exit(); } 
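The ops_dump_init()/SCX_CALL_OP() pairs above hand control to the scheduler's optional dump callbacks, which append to the same error dump buffer via scx_bpf_dump(). A minimal sketch of the per-CPU and per-task hooks, again assuming the tools/sched_ext helpers; the "dumpsketch" name and the printed fields are illustrative only:

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(dumpsketch_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	/* appended under the "CPU %-4d:" block emitted by scx_dump_state() */
	scx_bpf_dump("dumpsketch: cpu=%d idle=%d", cpu, idle);
}

void BPF_STRUCT_OPS(dumpsketch_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	/* appended under the per-task lines emitted by scx_dump_task() */
	scx_bpf_dump("dumpsketch: pid=%d slice=%llu", p->pid, p->scx.slice);
}

SCX_OPS_DEFINE(dumpsketch_ops,
	       .dump_cpu	= (void *)dumpsketch_dump_cpu,
	       .dump_task	= (void *)dumpsketch_dump_task,
	       .name		= "dumpsketch");

Note that, per the idle check above, a scheduler that provides dump_cpu also gets idle CPUs included in the dump.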
@@ -5096,20 +4648,20 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); dump_line(&s, "Event counters"); dump_line(&s, "--------------"); - scx_bpf_events(&events, sizeof(events)); + scx_read_events(sch, &events); scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); @@ -5121,59 +4673,156 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) spin_unlock_irqrestore(&dump_lock, flags); } -static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +static void scx_error_irq_workfn(struct irq_work *irq_work) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_exit_info *ei = sch->exit_info; if (ei->kind >= SCX_EXIT_ERROR) - scx_dump_state(ei, scx_ops.exit_dump_len); + scx_dump_state(ei, sch->ops.exit_dump_len); - schedule_scx_ops_disable_work(); + kthread_queue_work(sch->helper, &sch->disable_work); } -static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); - -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...) +static bool scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, va_list args) { - struct scx_exit_info *ei = scx_exit_info; - int none = SCX_EXIT_NONE; - va_list args; + struct scx_exit_info *ei = sch->exit_info; - if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) - return; + if (!scx_claim_exit(sch, kind)) + return false; ei->exit_code = exit_code; #ifdef CONFIG_STACKTRACE if (kind >= SCX_EXIT_ERROR) ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); #endif - va_start(args, fmt); vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); - va_end(args); /* * Set ei->kind and ->reason for scx_dump_state(). They'll be set again - * in scx_ops_disable_workfn(). + * in scx_disable_workfn(). */ ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - irq_work_queue(&scx_ops_error_irq_work); + irq_work_queue(&sch->error_irq_work); + return true; } -static struct kthread_worker *scx_create_rt_helper(const char *name) +static int alloc_kick_syncs(void) { - struct kthread_worker *helper; + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size + * can exceed percpu allocator limits on large machines. 
+ */ + for_each_possible_cpu(cpu) { + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *new_ksyncs; - helper = kthread_run_worker(0, name); - if (helper) - sched_set_fifo(helper->task); - return helper; + WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); + + new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_ksyncs) { + free_kick_syncs(); + return -ENOMEM; + } + + rcu_assign_pointer(*ksyncs, new_ksyncs); + } + + return 0; } -static void check_hotplug_seq(const struct sched_ext_ops *ops) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +{ + struct scx_sched *sch; + int node, ret; + + sch = kzalloc(sizeof(*sch), GFP_KERNEL); + if (!sch) + return ERR_PTR(-ENOMEM); + + sch->exit_info = alloc_exit_info(ops->exit_dump_len); + if (!sch->exit_info) { + ret = -ENOMEM; + goto err_free_sch; + } + + ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); + if (ret < 0) + goto err_free_ei; + + sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]), + GFP_KERNEL); + if (!sch->global_dsqs) { + ret = -ENOMEM; + goto err_free_hash; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + ret = -ENOMEM; + goto err_free_gdsqs; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + sch->global_dsqs[node] = dsq; + } + + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) { + ret = -ENOMEM; + goto err_free_gdsqs; + } + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); + goto err_free_pcpu; + } + + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); + init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + kthread_init_work(&sch->disable_work, scx_disable_workfn); + sch->ops = *ops; + ops->priv = sch; + + sch->kobj.kset = scx_kset; + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err_stop_helper; + + return sch; + +err_stop_helper: + kthread_destroy_worker(sch->helper); +err_free_pcpu: + free_percpu(sch->pcpu); +err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); +err_free_hash: + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); +err_free_ei: + free_exit_info(sch->exit_info); +err_free_sch: + kfree(sch); + return ERR_PTR(ret); +} + +static int check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -5185,21 +4834,25 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops) if (ops->hotplug_seq) { global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); if (ops->hotplug_seq != global_hotplug_seq) { - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "expected hotplug seq %llu did not match actual %llu", - ops->hotplug_seq, global_hotplug_seq); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + return -EBUSY; } } + + return 0; } -static int validate_ops(const struct sched_ext_ops *ops) +static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) { /* * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the * ops.enqueue() callback isn't implemented. 
*/ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { - scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); return -EINVAL; } @@ -5209,22 +4862,26 @@ static int validate_ops(const struct sched_ext_ops *ops) */ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); return -EINVAL; } if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + if (ops->cpu_acquire || ops->cpu_release) + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + return 0; } -static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { + struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, node, ret; + int i, cpu, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -5232,87 +4889,31 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) return -EINVAL; } - mutex_lock(&scx_ops_enable_mutex); - - /* - * Clear event counters so a new scx scheduler gets - * fresh event counter values. - */ - for_each_possible_cpu(cpu) { - struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); - memset(e, 0, sizeof(*e)); - } - - if (!scx_ops_helper) { - WRITE_ONCE(scx_ops_helper, - scx_create_rt_helper("sched_ext_ops_helper")); - if (!scx_ops_helper) { - ret = -ENOMEM; - goto err_unlock; - } - } - - if (!global_dsqs) { - struct scx_dispatch_q **dsqs; + mutex_lock(&scx_enable_mutex); - dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); - if (!dsqs) { - ret = -ENOMEM; - goto err_unlock; - } - - for_each_node_state(node, N_POSSIBLE) { - struct scx_dispatch_q *dsq; - - dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) { - for_each_node_state(node, N_POSSIBLE) - kfree(dsqs[node]); - kfree(dsqs); - ret = -ENOMEM; - goto err_unlock; - } - - init_dsq(dsq, SCX_DSQ_GLOBAL); - dsqs[node] = dsq; - } - - global_dsqs = dsqs; - } - - if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; goto err_unlock; } - scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); - if (!scx_root_kobj) { - ret = -ENOMEM; + ret = alloc_kick_syncs(); + if (ret) goto err_unlock; - } - scx_root_kobj->kset = scx_kset; - ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); - if (ret < 0) - goto err; - - scx_exit_info = alloc_exit_info(ops->exit_dump_len); - if (!scx_exit_info) { - ret = -ENOMEM; - goto err_del; + sch = scx_alloc_and_add_sched(ops); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto err_free_ksyncs; } /* - * Set scx_ops, transition to ENABLING and clear exit info to arm the - * disable path. Failure triggers full disabling from here on. + * Transition to ENABLING and clear exit info to arm the disable path. + * Failure triggers full disabling from here on. 
*/ - scx_ops = *ops; - - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != - SCX_OPS_DISABLED); - - atomic_set(&scx_exit_kind, SCX_EXIT_NONE); - scx_warned_zero_slice = false; + WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); + WARN_ON_ONCE(scx_root); + if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) + WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); @@ -5325,28 +4926,39 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); + /* + * Make the scheduler instance visible. Must be inside cpus_read_lock(). + * See handle_hotplug(). + */ + rcu_assign_pointer(scx_root, sch); + scx_idle_enable(ops); - if (scx_ops.init) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); if (ret) { - ret = ops_sanitize_err("init", ret); + ret = ops_sanitize_err(sch, "init", ret); cpus_read_unlock(); - scx_ops_error("ops.init() failed (%d)", ret); + scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + set_bit(i, sch->has_op); - check_hotplug_seq(ops); + ret = check_hotplug_seq(sch, ops); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); - ret = validate_ops(ops); + ret = validate_ops(sch, ops); if (ret) goto err_disable; @@ -5371,27 +4983,20 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Once __scx_ops_enabled is set, %current can be switched to SCX - * anytime. This can lead to stalls as some BPF schedulers (e.g. - * userspace scheduling) may not function correctly before all tasks are - * switched. Init in bypass mode to guarantee forward progress. + * Once __scx_enabled is set, %current can be switched to SCX anytime. + * This can lead to stalls as some BPF schedulers (e.g. userspace + * scheduling) may not function correctly before all tasks are switched. + * Init in bypass mode to guarantee forward progress. */ - scx_ops_bypass(true); + scx_bypass(true); + scx_bypassed_for_enable = true; for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable(&scx_has_op[i]); - - if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) - static_branch_enable(&scx_ops_allow_queued_wakeup); - if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable(&scx_ops_enq_exiting); - if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) - static_branch_enable(&scx_ops_enq_migration_disabled); - if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable(&scx_ops_cpu_preempt); + set_bit(i, sch->has_op); + + if (sch->ops.cpu_acquire || sch->ops.cpu_release) + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -5399,8 +5004,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ percpu_down_write(&scx_fork_rwsem); - WARN_ON_ONCE(scx_ops_init_task_enabled); - scx_ops_init_task_enabled = true; + WARN_ON_ONCE(scx_init_task_enabled); + scx_init_task_enabled = true; /* * Enable ops for every task. 
Fork is excluded by scx_fork_rwsem @@ -5409,14 +5014,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * tasks. Prep all tasks first and then enable them with preemption * disabled. * - * All cgroups should be initialized before scx_ops_init_task() so that - * the BPF scheduler can reliably track each task's cgroup membership - * from scx_ops_init_task(). Lock out cgroup on/offlining and task - * migrations while tasks are being initialized so that - * scx_cgroup_can_attach() never sees uninitialized tasks. + * All cgroups should be initialized before scx_init_task() so that the + * BPF scheduler can reliably track each task's cgroup membership from + * scx_init_task(). Lock out cgroup on/offlining and task migrations + * while tasks are being initialized so that scx_cgroup_can_attach() + * never sees uninitialized tasks. */ scx_cgroup_lock(); - ret = scx_cgroup_init(); + ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5432,20 +5037,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_unlock(&sti); - ret = scx_ops_init_task(p, task_group(p), false); + ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); - scx_ops_error("ops.init_task() failed (%d) for %s[%d]", - ret, p->comm, p->pid); + scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); @@ -5456,7 +5059,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - static_branch_enable(&__scx_ops_enabled); + static_branch_enable(&__scx_enabled); /* * We're fully committed and can't fail. 
The task READY -> ENABLED @@ -5466,31 +5069,29 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + const struct sched_class *new_class = scx_setscheduler_class(p); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (scx_get_task_state(p) != SCX_TASK_READY) + continue; - sched_enq_and_set_task(&ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - check_class_changed(task_rq(p), p, old_class, p->prio); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = READ_ONCE(scx_slice_dfl); + p->sched_class = new_class; + } } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + scx_bypassed_for_enable = false; + scx_bypass(false); - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); goto err_disable; } @@ -5498,44 +5099,37 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable(&__scx_switched_all); pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", - scx_ops.name, scx_switched_all() ? "" : " (partial)"); - kobject_uevent(scx_root_kobj, KOBJ_ADD); - mutex_unlock(&scx_ops_enable_mutex); + sch->ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(&sch->kobj, KOBJ_ADD); + mutex_unlock(&scx_enable_mutex); atomic_long_inc(&scx_enable_seq); return 0; -err_del: - kobject_del(scx_root_kobj); -err: - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - if (scx_exit_info) { - free_exit_info(scx_exit_info); - scx_exit_info = NULL; - } +err_free_ksyncs: + free_kick_syncs(); err_unlock: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); return ret; err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); /* * Returning an error code here would not pass all the error information - * to userspace. Record errno using scx_ops_error() for cases - * scx_ops_error() wasn't already invoked and exit indicating success so - * that the error is notified through ops.exit() with all the details. + * to userspace. Record errno using scx_error() for cases scx_error() + * wasn't already invoked and exit indicating success so that the error + * is notified through ops.exit() with all the details. * - * Flush scx_ops_disable_work to ensure that error is reported before - * init completion. + * Flush scx_disable_work to ensure that error is reported before init + * completion. sch's base reference will be put by bpf_scx_unreg(). 
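As the comment above notes, enable-path failures are reported back through ops.exit() rather than through the registration return value. Purely for illustration, a BPF scheduler can observe those details with an exit callback along these lines; this is a sketch, not part of the patch, and it assumes the usual scx BPF scaffolding (vmlinux.h plus the tooling's common.bpf.h, which provides the BPF_STRUCT_OPS() macro), with "example_exit" being a made-up name:

/* hypothetical ops.exit() callback; field names follow struct scx_exit_info */
void BPF_STRUCT_OPS(example_exit, struct scx_exit_info *ei)
{
	/* ei->kind and ei->exit_code carry what scx_error()/scx_exit() recorded */
	bpf_printk("scx exit: kind=%d exit_code=%lld", ei->kind, ei->exit_code);
}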
*/ - scx_ops_error("scx_ops_enable() failed (%d)", ret); - kthread_flush_work(&scx_ops_disable_work); + scx_error(sch, "scx_enable() failed (%d)", ret); + kthread_flush_work(&sch->disable_work); return 0; } @@ -5586,21 +5180,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } -static const struct bpf_func_proto * -bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; - default: - return bpf_base_func_proto(func_id, prog); - } -} - static const struct bpf_verifier_ops bpf_scx_verifier_ops = { - .get_func_proto = bpf_scx_get_func_proto, + .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_scx_is_valid_access, .btf_struct_access = bpf_scx_btf_struct_access, }; @@ -5679,13 +5260,17 @@ static int bpf_scx_check_member(const struct btf_type *t, static int bpf_scx_reg(void *kdata, struct bpf_link *link) { - return scx_ops_enable(kdata, link); + return scx_enable(kdata, link); } static void bpf_scx_unreg(void *kdata, struct bpf_link *link) { - scx_ops_disable(SCX_EXIT_UNREG); - kthread_flush_work(&scx_ops_disable_work); + struct sched_ext_ops *ops = kdata; + struct scx_sched *sch = ops->priv; + + scx_disable(SCX_EXIT_UNREG); + kthread_flush_work(&sch->disable_work); + kobject_put(&sch->kobj); } static int bpf_scx_init(struct btf *btf) @@ -5739,6 +5324,8 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} +static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -5776,6 +5363,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_move = sched_ext_ops__cgroup_move, .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, @@ -5807,10 +5396,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { static void sysrq_handle_sched_ext_reset(u8 key) { - if (scx_ops_helper) - scx_ops_disable(SCX_EXIT_SYSRQ); - else - pr_info("sched_ext: BPF scheduler not yet used\n"); + scx_disable(SCX_EXIT_SYSRQ); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -5852,29 +5438,38 @@ static bool can_skip_idle_kick(struct rq *rq) return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); } -static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) { struct rq *rq = cpu_rq(cpu); struct scx_rq *this_scx = &this_rq->scx; + const struct sched_class *cur_class; bool should_wait = false; unsigned long flags; raw_spin_rq_lock_irqsave(rq, flags); + cur_class = rq->curr->sched_class; /* * During CPU hotplug, a CPU may depend on 
kicking itself to make - * forward progress. Allow kicking self regardless of online state. + * forward progress. Allow kicking self regardless of online state. If + * @cpu is running a higher class task, we have no control over @cpu. + * Skip kicking. */ - if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && + !sched_class_above(cur_class, &ext_sched_class)) { if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { - if (rq->curr->sched_class == &ext_sched_class) + if (cur_class == &ext_sched_class) rq->curr->scx.slice = 0; cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); } if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { - pseqs[cpu] = rq->scx.pnt_seq; - should_wait = true; + if (cur_class == &ext_sched_class) { + ksyncs[cpu] = rq->scx.kick_sync; + should_wait = true; + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } } resched_curr(rq); @@ -5906,12 +5501,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); bool should_wait = false; + unsigned long *ksyncs; s32 cpu; + if (unlikely(!ksyncs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); + return; + } + + ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; + for_each_cpu(cpu, this_scx->cpus_to_kick) { - should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } @@ -5925,20 +5528,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) return; for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - if (cpu != cpu_of(this_rq)) { - /* - * Pairs with smp_store_release() issued by this CPU in - * switch_class() on the resched path. - * - * We busy-wait here to guarantee that no other task can - * be scheduled on our core before the target CPU has - * entered the resched path. - */ - while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) - cpu_relax(); - } + /* + * Busy-wait until the task running at the time of kicking is no + * longer running. This can be used to implement e.g. core + * scheduling. + * + * smp_cond_load_acquire() pairs with store_releases in + * pick_task_scx() and put_prev_task_scx(). The former breaks + * the wait if SCX's scheduling path is entered even if the same + * task is picked subsequently. The latter is necessary to break + * the wait when $cpu is taken by a higher sched class. + */ + if (cpu != cpu_of(this_rq)) + smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } @@ -5958,13 +5562,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - enum scx_ops_enable_state state = scx_ops_enable_state(); + struct scx_sched *sch = scx_root; + enum scx_enable_state state = scx_enable_state(); const char *all = READ_ONCE(scx_switching_all) ? 
"+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsigned long runnable_at; - if (state == SCX_OPS_DISABLED) + if (state == SCX_DISABLED) return; /* @@ -5973,8 +5578,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) */ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || class != &ext_sched_class) { - printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, - scx_ops_enable_state_str[state], all); + printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, + scx_enable_state_str[state], all); return; } @@ -5985,7 +5590,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", - log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + log_lvl, sch->ops.name, scx_enable_state_str[state], all, runnable_at_buf); } @@ -6001,12 +5606,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void * case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: - scx_ops_bypass(true); + scx_bypass(true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE: - scx_ops_bypass(false); + scx_bypass(false); break; } @@ -6029,19 +5634,14 @@ void __init init_sched_ext_class(void) WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | SCX_TG_ONLINE); - BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -6049,8 +5649,8 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; @@ -6065,40 +5665,41 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_ops_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } return true; } -static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_ops_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -6136,7 +5737,8 @@ __bpf_kfunc_start_defs(); * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the - * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the + * counter. * * This function doesn't have any locking restrictions and may be called under * BPF locks (in the future when BPF introduces more flexible locking). @@ -6145,81 +5747,139 @@ __bpf_kfunc_start_defs(); * exhaustion. If zero, the current residual slice is maintained. If * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. */ -__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) - return; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + return false; if (slice) p->scx.slice = slice; else p->scx.slice = p->scx.slice ?: 1; - scx_dsq_insert_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); + + return true; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +/* + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
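For context on the caller side, scx_bpf_dsq_insert() (and the ___v2 variant above) is what a BPF scheduler invokes from ops.enqueue() or ops.select_cpu() to queue a task. A minimal FIFO-style enqueue could look like the sketch below; it is illustrative only, SHARED_DSQ is a made-up DSQ id that is assumed to have been created beforehand with scx_bpf_create_dsq(), and the scx tooling headers are assumed for BPF_STRUCT_OPS():

/* SHARED_DSQ is a hypothetical user DSQ id; any value without the builtin flag works */
#define SHARED_DSQ 0

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* FIFO-queue the task on the shared DSQ with the default slice */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	/* nudge the task's previous CPU; SCX_KICK_IDLE makes this a no-op
	 * unless that CPU is currently idle */
	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
}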
+ */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) +{ + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); +} + +static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); - scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + return false; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + + return true; } +struct scx_bpf_dsq_insert_vtime_args { + /* @p can't be packed together as KF_RCU is not transitive */ + u64 dsq_id; + u64 slice; + u64 vtime; + u64 enq_flags; +}; + /** - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion * @p: task_struct to insert - * @dsq_id: DSQ to insert into - * @slice: duration @p can run for in nsecs, 0 to keep the current value - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ - * @enq_flags: SCX_ENQ_* + * @args: struct containing the rest of the arguments + * @args->dsq_id: DSQ to insert into + * @args->slice: duration @p can run for in nsecs, 0 to keep the current value + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @args->enq_flags: SCX_ENQ_* * - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime. All other aspects - * are identical to scx_bpf_dsq_insert(). + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided + * as an inline wrapper in common.bpf.h. * - * @vtime ordering is according to time_before64() which considers wrapping. A - * numerically larger vtime may indicate an earlier position in the ordering and - * vice-versa. + * Insert @p into the vtime priority queue of the DSQ identified by + * @args->dsq_id. Tasks queued into the priority queue are ordered by + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). + * + * @args->vtime ordering is according to time_before64() which considers + * wrapping. A numerically larger vtime may indicate an earlier position in the + * ordering and vice-versa. * * A DSQ can only be used as a FIFO or priority queue at any given time and this * function must not be called on a DSQ which already has one or more FIFO tasks * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and * SCX_DSQ_GLOBAL) cannot be used as priority queues. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. 
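The struct-wrapped calling convention documented above exists only to work around BPF's five-argument limit for kfuncs. Under the assumption stated in the comment (that the scx tooling's common.bpf.h supplies the user-facing inline wrapper), that wrapper would in essence look like the following sketch, which is not the tooling's literal code:

/* sketch of the assumed inline wrapper around the arg-struct kfunc */
static inline bool example_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
					    u64 slice, u64 vtime, u64 enq_flags)
{
	/* pack everything except @p, which must remain a direct RCU argument */
	struct scx_bpf_dsq_insert_vtime_args args = {
		.dsq_id		= dsq_id,
		.slice		= slice,
		.vtime		= vtime,
		.enq_flags	= enq_flags,
	};

	return __scx_bpf_dsq_insert_vtime(p, &args);
}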
*/ -__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc bool +__scx_bpf_dsq_insert_vtime(struct task_struct *p, + struct scx_bpf_dsq_insert_vtime_args *args) { - if (!scx_dsq_insert_preamble(p, enq_flags)) - return; + struct scx_sched *sch; - if (slice) - p->scx.slice = slice; - else - p->scx.slice = p->scx.slice ?: 1; + guard(rcu)(); - p->scx.dsq_vtime = vtime; + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, + args->vtime, args->enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +/* + * COMPAT: Will be removed in v6.23. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) { - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); - scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { @@ -6230,13 +5890,22 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) + return false; + + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). + */ + if (unlikely(READ_ONCE(scx_aborting))) return false; /* @@ -6259,13 +5928,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } - /* - * If the BPF scheduler keeps calling this function repeatedly, it can - * cause similar live-lock conditions as consume_dispatch_q(). Insert a - * breather if necessary. 
- */ - scx_ops_breather(src_rq); - locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -6282,7 +5944,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, } /* @p is still on $src_dsq and stable, determine the destination */ - dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); /* * Apply vtime and slice updates before moving so that the new time is @@ -6295,7 +5957,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, p->scx.slice = kit->slice; /* execute move */ - locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); + locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { @@ -6321,7 +5983,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -6336,14 +6006,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_ops_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -6364,23 +6041,30 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; - flush_dispatch_buf(dspc->rq); + flush_dispatch_buf(sch, dspc->rq); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); return false; } - if (consume_dispatch_q(dspc->rq, dsq)) { + if (consume_dispatch_q(sch, dspc->rq, dsq)) { /* * A successfully consumed task can be dequeued before it starts * running while the CPU is trying to migrate other dispatched - * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * tasks. Bump nr_tasks to tell balance_one() to retry on empty * local DSQ. 
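To tie this to the BPF side: ops.dispatch() typically refills a CPU's local DSQ by pulling from a user-created DSQ via scx_bpf_dsq_move_to_local(). A sketch, with SHARED_DSQ being the same made-up id used in the earlier enqueue example and the scx tooling headers assumed:

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* pull the first runnable task from the shared DSQ onto this CPU's
	 * local DSQ; returns false if the shared DSQ was empty */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}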
*/ dspc->nr_tasks++; @@ -6390,13 +6074,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); - return scx_bpf_dsq_move_to_local(dsq_id); -} - /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6415,14 +6092,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); - scx_bpf_dsq_move_set_slice(it__iter, slice); -} - /** * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6442,14 +6111,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); - scx_bpf_dsq_move_set_vtime(it__iter, vtime); -} - /** * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress @@ -6471,8 +6132,9 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). * - * Returns %true if @p has been consumed, %false if @p had already been consumed - * or dequeued. + * Returns %true if @p has been consumed, %false if @p had already been + * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local + * DSQ. 
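A typical user of scx_bpf_dsq_move() pairs it with the DSQ iterator: walk a user DSQ and transfer a selected task to the local DSQ. Roughly as below; this is a sketch in which bpf_for_each() and BPF_FOR_EACH_ITER are helpers provided by the scx/BPF tooling headers, SHARED_DSQ is a made-up id, and the helper is meant to be called from ops.dispatch() or another context that holds no rq lock:

static bool example_pull_one(u64 max_vtime)
{
	struct task_struct *p;

	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		/* signed comparison, same wrap-aware ordering as time_before64() */
		if ((s64)(p->scx.dsq_vtime - max_vtime) < 0) {
			scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0);
			return true;
		}
	}
	return false;
}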
*/ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, @@ -6482,15 +6144,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); - return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); -} - /** * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress @@ -6516,30 +6169,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); - return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_consume) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6547,26 +6186,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; -__bpf_kfunc_start_defs(); - -/** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of - * processed tasks. Can only be called from ops.cpu_release(). - */ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +static u32 reenq_local(struct rq *rq) { LIST_HEAD(tasks); u32 nr_enqueued = 0; - struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) - return 0; - - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); /* @@ -6586,12 +6211,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. 
*/ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); @@ -6607,6 +6228,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) return nr_enqueued; } +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + * + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void + * returning variant that can be called from anywhere. + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + return reenq_local(rq); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) @@ -6630,24 +6282,46 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { + struct scx_dispatch_q *dsq; + struct scx_sched *sch; + s32 ret; + if (unlikely(node >= (int)nr_node_ids || (node < 0 && node != NUMA_NO_NODE))) return -EINVAL; - return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) + return -EINVAL; + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return -ENOMEM; + + init_dsq(dsq, dsq_id); + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (sch) + ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params); + else + ret = -ENODEV; + + rcu_read_unlock(); + if (ret) + kfree(dsq); + return ret; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { @@ -6658,21 +6332,39 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); /** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags + * scx_bpf_task_set_slice - Set task's time slice + * @p: task of interest + * @slice: time slice to set in nsecs * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. + * Set @p's time slice to @slice. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. 
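On the BPF side, the reworked scx_bpf_create_dsq() above is used the same way as before: user DSQs are created up front, typically from a sleepable ops.init(), so that ops.enqueue() can reference them later. A sketch, where BPF_STRUCT_OPS_SLEEPABLE() is assumed from the scx tooling headers and SHARED_DSQ is the made-up id from the earlier sketches:

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* -1 == NUMA_NO_NODE: no preference for where the DSQ lives */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}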
*/ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + p->scx.slice = slice; + return true; +} + +/** + * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering + * @p: task of interest + * @vtime: virtual time to set + * + * Set @p's virtual time to @vtime. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + p->scx.dsq_vtime = vtime; + return true; +} + +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; - if (!ops_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return; local_irq_save(irq_flags); @@ -6696,7 +6388,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6721,6 +6413,26 @@ out: } /** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @@ -6729,23 +6441,30 @@ out: */ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) { + struct scx_sched *sch; struct scx_dispatch_q *dsq; s32 ret; preempt_disable(); + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) { + ret = -ENODEV; + goto out; + } + if (dsq_id == SCX_DSQ_LOCAL) { ret = READ_ONCE(this_rq()->scx.local_dsq.nr); goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (ops_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); goto out; } } else { - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6768,7 +6487,13 @@ out: */ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) { - destroy_dsq(dsq_id); + struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + destroy_dsq(sch, dsq_id); + rcu_read_unlock(); } /** @@ -6785,22 +6510,34 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; + struct scx_sched *sch; BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. 
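The new scx_bpf_task_set_slice()/scx_bpf_task_set_dsq_vtime() setters above give schedulers an explicit way to update these per-task fields. For instance, a weight-scaled vtime charge from ops.stopping() might look like the sketch below; the scaling factor follows the common pattern seen in the scx example schedulers and is purely illustrative:

void BPF_STRUCT_OPS(example_stopping, struct task_struct *p, bool runnable)
{
	/* charge the consumed portion of the default slice, scaled
	 * inversely by the task's weight */
	u64 used = SCX_SLICE_DFL - p->scx.slice;

	scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + used * 100 / p->scx.weight);
}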
+ */ + kit->dsq = NULL; + + sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + if (unlikely(!sch)) + return -ENODEV; if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_user_dsq(dsq_id); + kit->dsq = find_user_dsq(sch, dsq_id); if (!kit->dsq) return -ENOENT; - INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; - kit->cursor.priv = READ_ONCE(kit->dsq->seq); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, + READ_ONCE(kit->dsq->seq)); return 0; } @@ -6874,31 +6611,65 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) kit->dsq = NULL; } +/** + * scx_bpf_dsq_peek - Lockless peek at the first element. + * @dsq_id: DSQ to examine. + * + * Read the first element in the DSQ. This is semantically equivalent to using + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, + * this provides only a point-in-time snapshot, and the contents may change + * by the time any subsequent locking operation reads the queue. + * + * Returns the pointer, or NULL indicates an empty queue OR internal error. + */ +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); + return NULL; + } + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); + return NULL; + } + + return rcu_dereference(dsq->first_task); +} + __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_ops_error("invalid data=%p and data__sz=%u", - (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_ops_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_ops_error("format preparation failed (%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -6906,18 +6677,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_ops_error("(\"%s\", %p, %u) failed to format", - fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -6936,12 +6706,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) 
{ + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", - scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6957,12 +6729,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", - scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6981,17 +6755,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; struct scx_bstr_buf *buf = &dd->buf; s32 ret; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (raw_smp_processor_id() != dd->cpu) { - scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); return; } /* append the formatted string to the line buf */ - ret = __bstr_format(buf->data, buf->line + dd->cursor, + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, sizeof(buf->line) - dd->cursor, fmt, data, data__sz); if (ret < 0) { dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", @@ -7018,6 +6799,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. 
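Both scx_bpf_reenqueue_local() (the ops.cpu_release()-restricted variant earlier in this hunk) and the deferred ___v2 form above serve the same purpose: hand tasks that are queued but not running back to the BPF scheduler when a CPU is taken by a higher-priority sched class. A typical caller, sketched under the usual scx tooling assumptions:

void BPF_STRUCT_OPS(example_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/* a higher-priority sched class preempted this CPU; re-enqueue the
	 * tasks sitting on its local DSQ so they can be placed elsewhere */
	scx_bpf_reenqueue_local();
}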
+ */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) +{ + struct rq *rq; + + guard(preempt)(); + + rq = this_rq(); + local_set(&rq->scx.reenq_local_deferred, 1); + schedule_deferred(rq); +} + +/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest * @@ -7027,7 +6826,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7049,7 +6853,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7071,19 +6880,46 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) */ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (ops_cpu_valid(cpu, NULL)) { - struct rq *rq = cpu_rq(cpu); + if (ops_cpu_valid(sch, cpu, NULL)) { + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_error(sch, "Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } } @@ -7161,13 +6997,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!ops_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) return NULL; + if (!sch->warned_deprecated_rq) { + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " + "use scx_bpf_locked_rq() when holding rq lock " + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); + sch->warned_deprecated_rq = true; + } + return cpu_rq(cpu); } /** + * scx_bpf_locked_rq - Return the rq currently locked by SCX + * + * Returns the rq if a rq lock is currently held by SCX. + * Otherwise emits an error and returns NULL. + */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + + rq = scx_locked_rq(); + if (!rq) { + scx_error(sch, "accessing rq without holding rq lock"); + return NULL; + } + + return rq; +} + +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). 
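Usage-wise, the deprecation of scx_bpf_cpu_rq() above steers callers toward scx_bpf_locked_rq() when an rq lock is held and scx_bpf_cpu_curr() for remote reads. Peeking at a remote CPU's current task then looks roughly like this sketch, where bpf_rcu_read_lock()/bpf_rcu_read_unlock() are the standard BPF RCU kfuncs and the NULL check covers both offline CPUs and a missing scheduler:

static void example_report_curr(s32 cpu)
{
	struct task_struct *curr;

	bpf_rcu_read_lock();
	curr = scx_bpf_cpu_curr(cpu);
	if (curr)
		bpf_printk("cpu%d is running pid %d", cpu, curr->pid);
	bpf_rcu_read_unlock();
}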
+ */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest * @@ -7183,8 +7082,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; + + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); @@ -7257,6 +7163,27 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) +{ + struct scx_event_stats *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } +} + /* * scx_bpf_events - Get a system-wide event counter to * @events: output buffer from a BPF program @@ -7265,23 +7192,16 @@ __bpf_kfunc u64 scx_bpf_now(void) __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) { - struct scx_event_stats e_sys, *e_cpu; - int cpu; + struct scx_sched *sch; + struct scx_event_stats e_sys; - /* Aggregate per-CPU event counters into the system-wide counters. 
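On the consuming side, scx_bpf_events() copies the aggregated counters out to the BPF program, which can then report whichever fields it cares about. A sketch, assuming the struct scx_event_stats members are named after the SCX_EV_* counters aggregated in scx_read_events() above:

static void example_dump_events(void)
{
	struct scx_event_stats events;

	scx_bpf_events(&events, sizeof(events));
	bpf_printk("keep_last=%lld refill_dfl=%lld",
		   events.SCX_EV_DISPATCH_KEEP_LAST,
		   events.SCX_EV_REFILL_SLICE_DFL);
}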
*/ - memset(&e_sys, 0, sizeof(e_sys)); - for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); - scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); - scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); - scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); - } + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + scx_read_events(sch, &e_sys); + else + memset(&e_sys, 0, sizeof(e_sys)); + rcu_read_unlock(); /* * We cannot entirely trust a BPF-provided size since a BPF program @@ -7297,15 +7217,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) @@ -7314,15 +7238,11 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif @@ -7400,6 +7320,12 @@ static int __init scx_init(void) return ret; } + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { + pr_err("sched_ext: Failed to allocate cpumasks\n"); + return -ENOMEM; + } + return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1bda96b19a1b..43429b33e52c 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,8 +8,6 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT -DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); - void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -21,6 +19,7 @@ void scx_rq_activate(struct rq *rq); void 
scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(int policy); +bool scx_allow_ttwu_queue(const struct task_struct *p); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) @@ -36,13 +35,6 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } -static inline bool scx_allow_ttwu_queue(const struct task_struct *p) -{ - return !scx_enabled() || - static_branch_likely(&scx_ops_allow_queued_wakeup) || - p->sched_class != &ext_sched_class; -} - #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -66,7 +58,7 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ -#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +#ifdef CONFIG_SCHED_CLASS_EXT void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) @@ -80,22 +72,24 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_EXT_GROUP_SCHED +void scx_tg_init(struct task_group *tg); int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); void scx_cgroup_move_task(struct task_struct *p); -void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); +void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us); #else /* CONFIG_EXT_GROUP_SCHED */ +static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } static inline void scx_cgroup_move_task(struct task_struct *p) {} -static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {} #endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index cb343ca889e0..3d9d404d5cd2 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); /* Enable/disable per-node idle cpumasks */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); -#ifdef CONFIG_SMP /* Enable/disable LLC aware optimizations */ static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); @@ -47,6 +46,13 @@ static struct scx_idle_cpus scx_idle_global_masks; static struct scx_idle_cpus **scx_idle_node_masks; /* + * Local per-CPU cpumasks (used to generate temporary idle cpumasks). + */ +static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); + +/* * Return the idle masks associated to a target @node. 
* * NUMA_NO_NODE identifies the global idle cpumask. @@ -68,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu) return cpu_to_node(cpu); } -bool scx_idle_test_and_clear_cpu(int cpu) +static bool scx_idle_test_and_clear_cpu(int cpu) { int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; @@ -131,6 +137,7 @@ found: goto retry; } +#ifdef CONFIG_NUMA /* * Tracks nodes that have not yet been visited when searching for an idle * CPU across all available nodes. @@ -179,11 +186,18 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i return cpu; } +#else +static inline s32 +pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* * Find an idle CPU in the system, starting from @node. */ -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) { s32 cpu; @@ -235,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu) sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd) - return 0; + return NULL; return sched_domain_span(sd); } @@ -392,6 +406,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) } /* + * Return true if @p can run on all possible CPUs, false otherwise. + */ +static inline bool task_affinity_all(const struct task_struct *p) +{ + return p->nr_cpus_allowed >= num_possible_cpus(); +} + +/* * Built-in CPU idle selection policy: * * 1. Prioritize full-idle cores: @@ -403,13 +425,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) * branch prediction optimizations. * * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. + * - if the above conditions aren't met, pick a CPU that shares the same + * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain + * cache locality. * * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. + * - choose a CPU from the same NUMA node, if the node cpumask is a + * subset of @cpus_allowed, to reduce memory access latency. * - * 5. Pick any idle CPU usable by the task. + * 5. Pick any idle CPU within the @cpus_allowed domain. * * Step 3 and 4 are performed only if the system has, respectively, * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and @@ -424,35 +448,69 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because * we never call ops.select_cpu() for them, see select_task_rq(). */ -s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) { - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; + const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; + const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; int node = scx_cpu_node_if_enabled(prev_cpu); + bool is_prev_allowed; s32 cpu; + preempt_disable(); + + /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* + * Determine the subset of CPUs usable by @p within @cpus_allowed. 
+ */ + if (allowed != p->cpus_ptr) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask); + + if (task_affinity_all(p)) { + allowed = cpus_allowed; + } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { + allowed = local_cpus; + } else { + cpu = -EBUSY; + goto out_enable; + } + } + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. + * Determine the subset of CPUs that the task can use in its + * current LLC and node. * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. + * If the task can run on all CPUs, use the node and LLC cpumasks + * directly. */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask); + const struct cpumask *cpus = numa_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + numa_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + numa_cpus = local_cpus; + } + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask); + const struct cpumask *cpus = llc_span(prev_cpu); - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); + if (allowed == p->cpus_ptr && task_affinity_all(p)) + llc_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + llc_cpus = local_cpus; } /* @@ -466,7 +524,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * then avoid a migration. */ cpu = smp_processor_id(); - if (cpus_share_cache(cpu, prev_cpu) && + if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -490,7 +548,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && !cpumask_empty(idle_cpumask(waker_node)->cpu)) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) + if (cpumask_test_cpu(cpu, allowed)) goto out_unlock; } } @@ -503,7 +561,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 /* * Keep using @prev_cpu if it's part of a fully idle core. */ - if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + if (is_prev_allowed && + cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * begin in prev_cpu's node and proceed to other nodes in * order of increasing distance. 
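/*
 * Illustration (not part of the patch): a standalone model of the fallback
 * order documented above for the built-in CPU selection policy, using
 * uint64_t bitmasks in place of cpumasks.  The WAKE_SYNC waker-CPU shortcut
 * is omitted; "idle_smt" holds CPUs whose whole core is idle; -1 means no
 * idle CPU was found.
 */
#include <stdint.h>

int model_first_cpu(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;
}

int model_select_cpu_dfl(int prev_cpu, uint64_t allowed, uint64_t llc,
			 uint64_t numa, uint64_t idle_cpu, uint64_t idle_smt)
{
	uint64_t prev = 1ULL << prev_cpu;
	int cpu;

	/* 1. full-idle cores: @prev_cpu first, then same LLC, same node, anywhere */
	if (idle_smt & allowed & prev)
		return prev_cpu;
	if ((cpu = model_first_cpu(idle_smt & allowed & llc)) >= 0)
		return cpu;
	if ((cpu = model_first_cpu(idle_smt & allowed & numa)) >= 0)
		return cpu;
	if ((cpu = model_first_cpu(idle_smt & allowed)) >= 0)
		return cpu;

	/* 2. reuse @prev_cpu if it is idle and allowed */
	if (idle_cpu & allowed & prev)
		return prev_cpu;

	/* 3./4./5. same LLC, same NUMA node, then anything in the allowed set */
	if ((cpu = model_first_cpu(idle_cpu & allowed & llc)) >= 0)
		return cpu;
	if ((cpu = model_first_cpu(idle_cpu & allowed & numa)) >= 0)
		return cpu;
	return model_first_cpu(idle_cpu & allowed);
}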
*/ - cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE); if (cpu >= 0) goto out_unlock; @@ -552,7 +611,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 /* * Use @prev_cpu if it's idle. */ - if (scx_idle_test_and_clear_cpu(prev_cpu)) { + if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; } @@ -583,10 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * in prev_cpu's node and proceed to other nodes in order of * increasing distance. */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); + cpu = scx_pick_idle_cpu(allowed, node, flags); out_unlock: rcu_read_unlock(); +out_enable: + preempt_enable(); return cpu; } @@ -596,7 +657,7 @@ out_unlock: */ void scx_idle_init_masks(void) { - int node; + int i; /* Allocate global idle cpumasks */ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); @@ -607,13 +668,23 @@ void scx_idle_init_masks(void) sizeof(*scx_idle_node_masks), GFP_KERNEL); BUG_ON(!scx_idle_node_masks); - for_each_node(node) { - scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), - GFP_KERNEL, node); - BUG_ON(!scx_idle_node_masks[node]); + for_each_node(i) { + scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, i); + BUG_ON(!scx_idle_node_masks[i]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); + } - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + /* Allocate local per-cpu idle cpumasks */ + for_each_possible_cpu(i) { + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); } } @@ -662,21 +733,12 @@ static void update_builtin_idle(int cpu, bool idle) */ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) { + struct scx_sched *sch = scx_root; int cpu = cpu_of(rq); lockdep_assert_rq_held(rq); /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* * Update the idle masks: * - for real idle transitions (do_notify == true) * - for idle-to-idle transitions (indicated by the previous task @@ -693,6 +755,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) if (static_branch_likely(&scx_builtin_idle_enabled)) if (do_notify || is_idle_task(rq->curr)) update_builtin_idle(cpu, idle); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). 
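/*
 * Kernel-style sketch (not part of the patch, not compilable on its own) of
 * the unwind ordering behind the out_unlock/out_enable labels above:
 * preemption is disabled before the RCU read section is entered because the
 * per-CPU scratch cpumasks are in use, so the exit paths release in reverse
 * order and the early -EBUSY exit never reaches rcu_read_unlock().
 */
s32 sketch_pick_idle(bool have_allowed_cpus)
{
	s32 cpu;

	preempt_disable();		/* per-CPU scratch cpumasks in use */

	if (!have_allowed_cpus) {
		cpu = -EBUSY;
		goto out_enable;	/* RCU read section was never entered */
	}

	rcu_read_lock();		/* llc_span()/numa_span() are RCU protected */
	cpu = 0;			/* stand-in for the idle-CPU search */
	goto out_unlock;

out_unlock:
	rcu_read_unlock();
out_enable:
	preempt_enable();
	return cpu;
}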
+ * + * This must come after builtin idle update so that BPF schedulers can + * create interlocking between ops.update_idle() and ops.enqueue() - + * either enqueue() sees the idle bit or update_idle() sees the task + * that enqueue() queued. + */ + if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -716,7 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops) cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); } } -#endif /* CONFIG_SMP */ void scx_idle_enable(struct sched_ext_ops *ops) { @@ -730,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops) else static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); -#ifdef CONFIG_SMP reset_idle_masks(ops); -#endif } void scx_idle_disable(void) @@ -745,10 +819,10 @@ void scx_idle_disable(void) * Helpers that can be called from the BPF scheduler. */ -static int validate_node(int node) +static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is disabled"); + scx_error(sch, "per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -758,13 +832,13 @@ static int validate_node(int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_ops_error("invalid node %d", node); + scx_error(sch, "invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_ops_error("unavailable node %d", node); + scx_error(sch, "unavailable node %d", node); return -EINVAL; } @@ -773,15 +847,101 @@ static int validate_node(int node) __bpf_kfunc_start_defs(); -static bool check_builtin_idle_enabled(void) +static bool check_builtin_idle_enabled(struct scx_sched *sch) { if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_ops_error("built-in idle tracking is disabled"); + scx_error(sch, "built-in idle tracking is disabled"); return false; } +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. + */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + +static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, + s32 prev_cpu, u64 wake_flags, + const struct cpumask *allowed, u64 flags) +{ + struct rq *rq; + struct rq_flags rf; + s32 cpu; + + if (!ops_cpu_valid(sch, prev_cpu, NULL)) + return -EINVAL; + + if (!check_builtin_idle_enabled(sch)) + return -EBUSY; + + /* + * If called from an unlocked context, acquire the task's rq lock, + * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. 
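/*
 * Hedged BPF-C sketch (not part of the patch) of the enqueue()/update_idle()
 * interlock the ordering comment above enables: either ops.enqueue() sees the
 * idle bit and kicks the CPU, or ops.update_idle() sees the task that was
 * just queued.  Assumes the usual sched_ext BPF headers (vmlinux.h,
 * scx/common.bpf.h), an illustrative custom DSQ SHARED_DSQ created in
 * ops.init(), and SCX_OPS_KEEP_BUILTIN_IDLE so the built-in idle helpers
 * remain usable alongside ops.update_idle().
 */
void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu;

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	/* If a usable CPU already shows up as idle, wake it. */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
{
	/*
	 * The built-in idle bit is set before this callback runs, so a
	 * concurrent enqueue either saw the bit (and kicked this CPU) or its
	 * task is already visible in the DSQ here.
	 */
	if (idle && scx_bpf_dsq_nr_queued(SHARED_DSQ))
		scx_bpf_kick_cpu(cpu, 0);
}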
+ * + * Otherwise, allow to use this kfunc only from ops.select_cpu() + * and ops.select_enqueue(). + */ + if (scx_kf_allowed_if_unlocked()) { + rq = task_rq_lock(p, &rf); + } else { + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + return -EPERM; + rq = scx_locked_rq(); + } + + /* + * Validate locking correctness to access p->cpus_ptr and + * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; + * otherwise, assert that p->pi_lock is held. + */ + if (!rq) + lockdep_assert_held(&p->pi_lock); + + /* + * This may also be called from ops.enqueue(), so we need to handle + * per-CPU tasks as well. For these tasks, we can skip all idle CPU + * selection optimizations and simply check whether the previously + * used CPU is idle and within the allowed cpumask. + */ + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { + if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && + scx_idle_test_and_clear_cpu(prev_cpu)) + cpu = prev_cpu; + else + cpu = -EBUSY; + } else { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, + allowed ?: p->cpus_ptr, flags); + } + + if (scx_kf_allowed_if_unlocked()) + task_rq_unlock(rq, p, &rf); + + return cpu; +} + /** * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or * trigger an error if @cpu is invalid @@ -789,14 +949,14 @@ static bool check_builtin_idle_enabled(void) */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { -#ifdef CONFIG_NUMA - if (!ops_cpu_valid(cpu, NULL)) - return NUMA_NO_NODE; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + return NUMA_NO_NODE; return cpu_to_node(cpu); -#else - return 0; -#endif } /** @@ -806,9 +966,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * @wake_flags: %SCX_WAKE_* flags * @is_idle: out parameter indicating whether the returned CPU is idle * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. * * Returns the picked CPU with *@is_idle indicating whether the picked CPU is * currently idle and thus a good candidate for direct dispatching. 
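/*
 * Hedged BPF-C sketch (not part of the patch) of the typical use of this
 * kfunc from ops.select_cpu(): when the default policy finds an idle CPU,
 * dispatch directly to the local DSQ so ops.enqueue() is skipped.  Assumes
 * the usual sched_ext BPF headers (vmlinux.h, scx/common.bpf.h).
 */
s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* idle CPU found: direct-dispatch, ops.enqueue() is skipped */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}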
@@ -816,31 +977,89 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { -#ifdef CONFIG_SMP + struct scx_sched *sch; s32 cpu; -#endif - if (!ops_cpu_valid(prev_cpu, NULL)) - goto prev_cpu; - if (!check_builtin_idle_enabled()) - goto prev_cpu; + guard(rcu)(); - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; -#ifdef CONFIG_SMP - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } -#endif - -prev_cpu: *is_idle = false; return prev_cpu; } +struct scx_bpf_select_cpu_and_args { + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */ + s32 prev_cpu; + u64 wake_flags; + u64 flags; +}; + +/** + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask + * @p: task_struct to select a CPU for + * @cpus_allowed: cpumask of allowed CPUs + * @args: struct containing the rest of the arguments + * @args->prev_cpu: CPU @p was on previously + * @args->wake_flags: %SCX_WAKE_* flags + * @args->flags: %SCX_PICK_IDLE* flags + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided + * as an inline wrapper in common.bpf.h. + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu(). + * + * Returns the selected idle CPU, which will be automatically awakened upon + * returning from ops.select_cpu() and can be used for direct dispatch, or + * a negative value if no idle CPU is available. + */ +__bpf_kfunc s32 +__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags, + cpus_allowed, args->flags); +} + +/* + * COMPAT: Will be removed in v6.22. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, + cpus_allowed, flags); +} + /** * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the * idle-tracking per-CPU cpumask of a target NUMA node. @@ -848,19 +1067,23 @@ prev_cpu: * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will - * be reported to the BPF scheduler via scx_ops_error(). + * be reported to the BPF scheduler via scx_error(). 
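/*
 * Hedged BPF-C sketch (not part of the patch) of calling the cpumask-aware
 * variant from ops.enqueue(), assuming the scx/common.bpf.h wrapper keeps the
 * argument order of the compat kfunc shown above.  SHARED_DSQ is an
 * illustrative custom DSQ; the explicit kick covers the fact that the
 * automatic wakeup described above only applies when returning the CPU from
 * ops.select_cpu().
 */
void BPF_STRUCT_OPS(sketch_enqueue_and, struct task_struct *p, u64 enq_flags)
{
	s32 cpu;

	/* look for a fully idle core @p may use, restricted to p->cpus_ptr */
	cpu = scx_bpf_select_cpu_and(p, scx_bpf_task_cpu(p), 0,
				     p->cpus_ptr, SCX_PICK_IDLE_CORE);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	if (cpu >= 0)
		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}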
*/ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -872,19 +1095,23 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -895,22 +1122,26 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will - * be reported to the BPF scheduler via scx_ops_error(). + * be reported to the BPF scheduler via scx_error(). */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(node)->smt; else return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -923,22 +1154,26 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(NUMA_NO_NODE)->smt; else return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -968,13 +1203,21 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!check_builtin_idle_enabled()) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return false; - if (ops_cpu_valid(cpu, NULL)) - return scx_idle_test_and_clear_cpu(cpu); - else + if (!check_builtin_idle_enabled(sch)) + return false; + + if (!ops_cpu_valid(sch, cpu, NULL)) return false; + + return scx_idle_test_and_clear_cpu(cpu); } /** @@ -999,7 +1242,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + 
return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1031,12 +1282,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); @@ -1066,9 +1325,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { + struct scx_sched *sch; s32 cpu; - node = validate_node(node); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1106,10 +1372,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; s32 cpu; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } @@ -1140,6 +1413,9 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { @@ -1147,21 +1423,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .set = &scx_kfunc_ids_idle, }; -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - int scx_idle_init(void) { int ret; - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 511cc2221f7a..fa583f141f35 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,22 +12,11 @@ struct sched_ext_ops; -#ifdef CONFIG_SMP void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void scx_idle_init_masks(void); -bool scx_idle_test_and_clear_cpu(int cpu); -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); -#else /* !CONFIG_SMP */ -static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} -static inline void 
scx_idle_init_masks(void) {} -static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } -static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) -{ - return -EBUSY; -} -#endif /* CONFIG_SMP */ -s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags); void scx_idle_enable(struct sched_ext_ops *ops); void scx_idle_disable(void); int scx_idle_init(void); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h new file mode 100644 index 000000000000..386c677e4c9a --- /dev/null +++ b/kernel/sched/ext_internal.h @@ -0,0 +1,1101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum scx_exit_flags { + /* + * ops.exit() may be called even if the loading failed before ops.init() + * finishes successfully. This is because ops.exit() allows rich exit + * info communication. The following flag indicates whether ops.init() + * finished successfully. + */ + SCX_EFLAG_INITIALIZED, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. 
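/*
 * Standalone illustration (not part of the patch) of the exit-code layout
 * described above: user bits in [31:0], system reasons in [47:32], system
 * actions in [63:48].  The MODEL_* names mirror the SCX_ECODE_* enums.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_ECODE_RSN_HOTPLUG	(1ULL << 32)	/* SCX_ECODE_RSN_HOTPLUG */
#define MODEL_ECODE_ACT_RESTART	(1ULL << 48)	/* SCX_ECODE_ACT_RESTART */

#define MODEL_USR_MASK	0x00000000ffffffffULL
#define MODEL_RSN_MASK	0x0000ffff00000000ULL
#define MODEL_ACT_MASK	0xffff000000000000ULL

int main(void)
{
	/* e.g. what a scheduler might pass when exiting on a hotplug event */
	uint64_t ecode = MODEL_ECODE_ACT_RESTART | MODEL_ECODE_RSN_HOTPLUG | 42;

	printf("act=%#llx rsn=%#llx usr=%llu\n",
	       (unsigned long long)(ecode & MODEL_ACT_MASK),
	       (unsigned long long)(ecode & MODEL_RSN_MASK),
	       (unsigned long long)(ecode & MODEL_USR_MASK));
	return 0;
}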
+ */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* %SCX_EFLAG_* */ + u64 flags; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. 
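/*
 * Hedged BPF-C sketch (not part of the patch) of how a scheduler might
 * combine the ops flags above in its struct_ops definition.  Assumes the
 * usual sched_ext BPF headers and the sketch_* callbacks from the earlier
 * snippets; the flag selection is purely illustrative.
 */
SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.select_cpu	= (void *)sketch_select_cpu,
	.enqueue	= (void *)sketch_enqueue,
	.update_idle	= (void *)sketch_update_idle,
	.flags		= SCX_OPS_KEEP_BUILTIN_IDLE |	/* keep built-in idle despite update_idle() */
			  SCX_OPS_ENQ_LAST |		/* see %SCX_ENQ_LAST */
			  SCX_OPS_ALLOW_QUEUED_WAKEUP,	/* enqueue() may arrive without select_cpu() */
	.timeout_ms	= 5000,
	.name		= "sketch",
};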
+ */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. 
+ * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. 
+ * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. 
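/*
 * Hedged BPF-C sketch (not part of the patch) of the ->running()/->stopping()
 * pairing described above, accumulating per-task CPU time in task local
 * storage.  Assumes the usual sched_ext BPF headers and that the storage
 * entry is created elsewhere (e.g. from ops.init_task() with
 * BPF_LOCAL_STORAGE_GET_F_CREATE).
 */
struct sketch_task_ctx {
	u64	running_at;
	u64	total_ns;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct sketch_task_ctx);
} sketch_task_ctxs SEC(".maps");

void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
{
	struct sketch_task_ctx *tctx;

	tctx = bpf_task_storage_get(&sketch_task_ctxs, p, NULL, 0);
	if (tctx)
		tctx->running_at = bpf_ktime_get_ns();
}

void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
{
	struct sketch_task_ctx *tctx;

	tctx = bpf_task_storage_get(&sketch_task_ctxs, p, NULL, 0);
	if (tctx && tctx->running_at) {
		tctx->total_ns += bpf_ktime_get_ns() - tctx->running_at;
		tctx->running_at = 0;
	}
}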
+ * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). 
+ */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. 
+ */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is up to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + + /** + * @cgroup_set_idle: A cgroup's idle state is being changed + * @cgrp: cgroup whose idle state is being updated + * @idle: whether the cgroup is entering or exiting idle state + * + * Update @cgrp's idle state to @idle. This callback is invoked when + * a cgroup transitions between idle and non-idle states, allowing the + * BPF scheduler to adjust its behavior accordingly. + */ + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); + +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. + */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. 
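/*
 * Standalone illustration (not part of the patch) of the cpu.max arithmetic
 * described above for ops.cgroup_set_bandwidth(): quota_us / period_us gives
 * the number of CPUs the cgroup is entitled to, and burst_us is read in the
 * same unit.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t period_us = 1000000, quota_us = 2500000, burst_us = 500000;

	printf("entitled CPUs: %.2f, extra burst CPUs: %.2f\n",
	       (double)quota_us / period_us,	/* 2.50, as in the example above */
	       (double)burst_us / period_us);	/* 0.50 */
	return 0;
}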
Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched_pcpu { + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats event_stats; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + struct scx_sched_pcpu __percpu *pcpu; + + /* + * Updates to the following warned bitfields can race causing RMW issues + * but it doesn't really matter. 
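/*
 * Standalone model (not part of the patch) of the per-CPU event counters
 * above: each CPU bumps its own scx_event_stats and a system-wide view is
 * produced by summing the per-CPU copies, which is what scx_bpf_events() is
 * described as doing.  Field names are abbreviated.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_NR_CPUS 4

struct model_event_stats {
	int64_t select_cpu_fallback;
	int64_t refill_slice_dfl;
};

static struct model_event_stats model_pcpu_stats[MODEL_NR_CPUS];

static struct model_event_stats model_sum_events(void)
{
	struct model_event_stats sum = {0};

	for (int cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		sum.select_cpu_fallback += model_pcpu_stats[cpu].select_cpu_fallback;
		sum.refill_slice_dfl += model_pcpu_stats[cpu].refill_slice_dfl;
	}
	return sum;
}

int main(void)
{
	model_pcpu_stats[0].refill_slice_dfl = 3;
	model_pcpu_stats[2].refill_slice_dfl = 5;
	printf("REFILL_SLICE_DFL total: %lld\n",
	       (long long)model_sum_events().refill_slice_dfl);
	return 0;
}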
+ */ + bool warned_zero_slice:1; + bool warned_deprecated_rq:1; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, + SCX_ENQ_NESTED = 1LLU << 58, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * The scx_bpf_kick_cpu() call will return after the current SCX task of + * the target CPU switches out. This can be used to implement e.g. core + * scheduling. This has no effect if the current task on the target CPU + * is not on SCX. 
+ */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, +}; + +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fb9bf995a47..da46c3164537 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. 
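/*
 * Standalone model (not part of the patch) of the ops_state word described
 * above: the low SCX_OPSS_QSEQ_SHIFT (2) bits carry the state and the
 * remaining bits carry the QSEQ brand that lets a dispatcher detect a racing
 * dequeue/requeue.
 */
#include <stdio.h>

#define MODEL_QSEQ_SHIFT	2
#define MODEL_STATE_MASK	((1UL << MODEL_QSEQ_SHIFT) - 1)
#define MODEL_QSEQ_MASK		(~MODEL_STATE_MASK)

enum { MODEL_NONE, MODEL_QUEUEING, MODEL_QUEUED, MODEL_DISPATCHING };

static unsigned long model_pack_opss(unsigned long state, unsigned long qseq)
{
	return (qseq << MODEL_QSEQ_SHIFT) | state;
}

int main(void)
{
	unsigned long opss = model_pack_opss(MODEL_QUEUED, 7);

	printf("state=%lu qseq=%lu\n",
	       opss & MODEL_STATE_MASK,				/* 2 (QUEUED) */
	       (opss & MODEL_QSEQ_MASK) >> MODEL_QSEQ_SHIFT);	/* 7 */
	return 0;
}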
*/ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -556,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a, static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->min_vruntime); + return (s64)(se->vruntime - cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -608,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * Which we track using: * - * v0 := cfs_rq->min_vruntime + * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * - * Since min_vruntime is a monotonic increasing variable that closely tracks - * the per-task service, these deltas: (v_i - v), will be in the order of the - * maximal (virtual) lag induced in the system due to quantisation. + * Since zero_vruntime closely tracks the per-task service, these + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. 
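The weighted-average identity above, V = (\Sum w_i*v_i) / (\Sum w_i) tracked as zero_vruntime plus \Sum (v_i - v0)*w_i over \Sum w_i, can be checked with a small standalone C sketch (toy entities and names; the kernel additionally applies scale_load_down() to the weights):

#include <stdio.h>

struct ent { long w; long long v; };    /* weight and absolute vruntime */

static long long avg_vruntime_demo(long long zero_vruntime,
                                   const struct ent *e, int n)
{
        long long sum = 0;      /* \Sum (v_i - v0) * w_i */
        long load = 0;          /* \Sum w_i */

        for (int i = 0; i < n; i++) {
                sum  += (e[i].v - zero_vruntime) * e[i].w;
                load += e[i].w;
        }
        return load ? zero_vruntime + sum / load : zero_vruntime;
}

int main(void)
{
        struct ent rq[3] = { { 1024, 1000 }, { 2048, 400 }, { 512, 1600 } };

        printf("V = %lld\n", avg_vruntime_demo(500, rq, 3));
        return 0;
}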
* @@ -673,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) avg = div_s64(avg, load); } - return cfs_rq->min_vruntime + avg; + return cfs_rq->zero_vruntime + avg; } /* @@ -734,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -742,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) -{ - u64 min_vruntime = cfs_rq->min_vruntime; - /* - * open coded max_vruntime() to allow updating avg_vruntime - */ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) { - avg_vruntime_update(cfs_rq, delta); - min_vruntime = vruntime; - } - return min_vruntime; -} - -static void update_min_vruntime(struct cfs_rq *cfs_rq) +static void update_zero_vruntime(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_root_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; - - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; - } + u64 vruntime = avg_vruntime(cfs_rq); + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); - if (se) { - if (!curr) - vruntime = se->min_vruntime; - else - vruntime = min_vruntime(vruntime, se->min_vruntime); - } + avg_vruntime_update(cfs_rq, delta); - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + cfs_rq->zero_vruntime = vruntime; } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) @@ -850,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); + update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -861,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); + update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -884,23 +856,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. + * Set the vruntime up to which an entity can run before looking + * for another entity to pick. + * In case of run to parity, we use the shortest slice of the enqueued + * entities to set the protected period. + * When run to parity is disabled, we give a minimum quantum to the running + * entity to ensure progress. 
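A standalone sketch of the protection window described above, assuming a simplified calc_delta_fair() that only scales by weight and purely illustrative values: the protected vruntime is the deadline clamped by where a short slice would end, and the signed subtraction in the final check is the usual wrap-safe comparison.

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD     1024ULL

/* simplified stand-in: convert a wall-clock delta to vruntime for a weight */
static uint64_t calc_delta_demo(uint64_t delta, unsigned long weight)
{
        return delta * NICE_0_LOAD / weight;
}

static uint64_t min_vruntime_demo(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0 ? a : b;
}

int main(void)
{
        uint64_t vruntime = 1000, deadline = 4000;
        uint64_t slice = 3000000;       /* shortest queued slice, in ns */
        unsigned long weight = 2048;    /* heavier than nice-0: vruntime advances slower */

        uint64_t vprot = min_vruntime_demo(deadline,
                                           vruntime + calc_delta_demo(slice, weight));

        /* slice protection holds while vprot is still ahead of vruntime */
        printf("vprot=%llu protected=%d\n",
               (unsigned long long)vprot, (int64_t)(vprot - vruntime) > 0);
        return 0;
}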
*/ -static inline void set_protect_slice(struct sched_entity *se) +static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->vlag = se->deadline; + u64 slice = normalized_sysctl_sched_base_slice; + u64 vprot = se->deadline; + + if (sched_feat(RUN_TO_PARITY)) + slice = cfs_rq_min_slice(cfs_rq); + + slice = min(slice, se->slice); + if (slice != se->slice) + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); + + se->vprot = vprot; +} + +static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = cfs_rq_min_slice(cfs_rq); + + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); } static inline bool protect_slice(struct sched_entity *se) { - return se->vlag == se->deadline; + return ((s64)(se->vprot - se->vruntime) > 0); } static inline void cancel_protect_slice(struct sched_entity *se) { if (protect_slice(se)) - se->vlag = se->deadline + 1; + se->vprot = se->vruntime; } /* @@ -922,7 +915,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. */ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -936,10 +929,20 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; + /* + * Picking the ->next buddy will affect latency but not fairness. + */ + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); + return cfs_rq->next; + } + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) + if (curr && protect && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -983,6 +986,11 @@ found: return best; } +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + return __pick_eevdf(cfs_rq, true); +} + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -996,7 +1004,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1015,6 @@ int sched_update_scaling(void) return 0; } -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,7 +1047,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" -#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -1131,34 +1136,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_SMP */ - -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct 
sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; + } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1166,58 +1177,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } -static inline void update_curr_task(struct task_struct *p, s64 delta_exec) -{ - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); -} - -static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (curr->vlag == curr->deadline) - return false; - - return !entity_eligible(cfs_rq, curr); -} - -static inline bool do_preempt_short(struct cfs_rq *cfs_rq, - struct sched_entity *pse, struct sched_entity *se) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (pse->slice >= se->slice) - return false; - - if (!entity_eligible(cfs_rq, pse)) - return false; - - if (entity_before(pse, se)) - return true; - - if (!entity_eligible(cfs_rq, se)) - return true; - - return false; -} +static void set_next_buddy(struct sched_entity *se); /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - s64 delta_exec; - - delta_exec = update_curr_se(rq, &donor->se); - if (likely(delta_exec > 0)) - update_curr_task(donor, delta_exec); - - return delta_exec; + return update_se(rq, &rq->donor->se); } /* @@ -1225,6 +1192,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! 
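As the note above explains, with proxy execution the entity selected by the scheduler (the donor) and the task physically running may differ: the change above charges execution time to the running task while cgroup time follows the donor. A toy standalone sketch of that split, with illustrative types and field names:

#include <stdio.h>

struct task_demo {
        long long sum_exec_runtime;     /* CPU time actually consumed */
        long long cgroup_runtime;       /* time billed to the task's cgroup */
};

static void account_delta(struct task_demo *running, struct task_demo *donor,
                          long long delta_exec)
{
        running->sum_exec_runtime += delta_exec;        /* who ran */
        donor->cgroup_runtime += delta_exec;            /* who is billed */
}

int main(void)
{
        struct task_demo donor = { 0, 0 }, lock_owner = { 0, 0 };

        /* donor blocked on a lock; the owner runs on its behalf for 1ms */
        account_delta(&lock_owner, &donor, 1000000);
        printf("owner ran %lldns, donor's cgroup billed %lldns\n",
               lock_owner.sum_exec_runtime, donor.cgroup_runtime);
        return 0;
}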
+ */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1233,19 +1206,14 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return; curr->vruntime += calc_delta_fair(delta_exec, curr); resched = update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); - - update_curr_task(p, delta_exec); - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -1256,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq) * against fair_server such that it can account for this time * and possibly avoid running this period. */ - if (dl_server_active(&rq->fair_server)) - dl_server_update(&rq->fair_server, delta_exec); + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1265,7 +1232,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || did_preempt_short(cfs_rq, curr)) { + if (resched || !protect_slice(curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } @@ -1512,7 +1479,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size); rss = get_mm_rss(p->mm); if (!rss) rss = nr_scan_pages; @@ -1940,17 +1907,18 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; + long nr = folio_nr_pages(folio); pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */ pgdat->nbp_threshold = 0; + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr); return true; } def_th = sysctl_numa_balancing_hot_threshold; - rate_limit = sysctl_numa_balancing_promote_rate_limit << \ - (20 - PAGE_SHIFT); + rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit); numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; @@ -1958,8 +1926,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, if (latency >= th) return false; - return !numa_promotion_rate_limit(pgdat, rate_limit, - folio_nr_pages(folio)); + return !numa_promotion_rate_limit(pgdat, rate_limit, nr); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); @@ -2114,12 +2081,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -2273,7 +2240,8 @@ static bool task_numa_compare(struct task_numa_env *env, rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); - if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) + if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || + !cur->mm)) cur = NULL; /* @@ -3329,6 +3297,15 @@ static void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Memory is pinned to only one NUMA node via cpuset.mems, naturally + * no page can be migrated. 
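The check that follows skips NUMA scanning when cpuset.mems allows a single node, by testing that the weight (population count) of the allowed-nodes mask is one. A standalone sketch of that test with a toy bitmask standing in for nodemask_t (illustrative names, using a compiler popcount builtin):

#include <stdio.h>

/* toy stand-in for a nodemask: one bit per NUMA node */
static int nodes_weight_demo(unsigned long mask)
{
        return __builtin_popcountl(mask);
}

int main(void)
{
        unsigned long mems_allowed = 1UL << 2;  /* only node 2 allowed */

        if (nodes_weight_demo(mems_allowed) == 1)
                printf("single node allowed, NUMA scanning is pointless\n");
        return 0;
}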
+ */ + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) { + trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); + return; + } + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); @@ -3549,7 +3526,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; @@ -3663,7 +3640,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3680,20 +3658,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3701,12 +3677,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3758,7 +3732,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3775,12 +3748,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3795,6 +3762,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); @@ -3811,29 +3779,19 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - update_load_add(&cfs_rq->load, se->load.weight); place_entity(cfs_rq, se, 0); + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); - - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. 
- */ - update_min_vruntime(cfs_rq); + cfs_rq->nr_queued++; } } @@ -3851,7 +3809,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. @@ -3958,7 +3915,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3976,23 +3932,16 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4017,7 +3966,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -4086,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (child_cfs_rq_on_list(cfs_rq)) return false; + if (cfs_rq->tg_load_avg_contrib) + return false; + return true; } @@ -4469,7 +4420,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4482,7 +4433,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4563,9 +4514,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -4933,13 +4884,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, goto done; /* - * To avoid overestimation of actual task utilization, skip updates if - * we cannot grant there is idle time in this CPU. - */ - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) - return; - - /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. 
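util_est keeps an exponentially weighted moving average of task utilization, and the comment above guards it against being dragged down by samples taken when the task may not have received all the CPU time it wanted. A standalone sketch of a 1/4-weight EWMA update behind such a guard (the weight and the guard variable are illustrative, not the kernel's exact fixed-point computation):

#include <stdio.h>

/* new = old + (sample - old)/4, i.e. an EWMA with weight 1/4 */
static long ewma_update(long ewma, long sample)
{
        return ewma + (sample - ewma) / 4;
}

int main(void)
{
        long ewma = 400;
        long sample = 200;              /* lower than the current average */
        int got_all_cpu_time = 0;       /* e.g. the task was runnable but preempted */

        /* skip the update if the low sample may just reflect CPU contention */
        if (got_all_cpu_time)
                ewma = ewma_update(ewma, sample);

        printf("ewma=%ld\n", ewma);
        return 0;
}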
*/ @@ -5139,48 +5083,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -5248,7 +5150,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: @@ -5364,18 +5266,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } +#endif } } @@ -5414,8 +5314,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5436,8 +5334,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5465,7 +5361,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. 
*/ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5511,20 +5407,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- i.e. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); - if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5545,7 +5442,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(se); + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -5584,16 +5481,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { struct sched_entity *se; - /* - * Picking the ->next buddy will affect latency but not fairness. - */ - if (sched_feat(PICK_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { - /* ->next will never be delayed */ - WARN_ON_ONCE(cfs_rq->next->sched_delayed); - return cfs_rq->next; - } - se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -5680,7 +5567,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ static bool cfs_bandwidth_used(void) { return true; @@ -5688,16 +5575,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ - -/* - * default period for cfs group bandwidth. 
- * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} +#endif /* !CONFIG_JUMP_LABEL */ static inline u64 sched_cfs_bandwidth_slice(void) { @@ -5807,74 +5685,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); +static void throttle_cfs_rq_work(struct callback_head *work) +{ + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ + p->throttled = true; + resched_curr(rq); + } +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + /* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. 
*/ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) { - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. + */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. + */ + if (throttled_hierarchy(cfs_rq) && + !task_current_donor(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); + /* we can't take the fast path, do an actual enqueue*/ + p->throttled = false; + return false; } +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; + + if (--cfs_rq->throttle_count) + return 0; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (WARN_ON_ONCE((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. 
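The loop that follows walks the limbo list with list_for_each_entry_safe() because each task is unlinked as it is re-enqueued; the safe variant caches the next element before the current one goes away. The same pattern in a standalone sketch using a plain singly linked list instead of the kernel list API:

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

int main(void)
{
        struct node *head = NULL;

        for (int i = 3; i > 0; i--) {
                struct node *n = malloc(sizeof(*n));

                n->id = i;
                n->next = head;
                head = n;
        }

        /* "safe" iteration: remember ->next before the current node is freed */
        for (struct node *p = head, *tmp; p; p = tmp) {
                tmp = p->next;
                printf("re-enqueue task %d\n", p->id);
                free(p);
        }
        return 0;
}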
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttle_count++) + return 0; - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5882,9 +5939,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5907,80 +5962,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. 
- */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); - - /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5988,11 +5980,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; - se = cfs_rq->tg->se[cpu_of(rq)]; + /* + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. + * + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; cfs_rq->throttled = 0; @@ -6020,62 +6020,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; } - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. 
*/ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6083,7 +6029,6 @@ unthrottle_throttle: resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6142,12 +6087,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6454,6 +6393,16 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); + + /* + * It is not enough to sync the "pelt_clock_throttled" indicator + * with the parent cfs_rq when the hierarchy is not queued. + * Always join a throttled hierarchy with PELT clock throttled + * and leaf it to the first enqueue, or distribution to + * unthrottle the PELT clock. + */ + if (cfs_rq->throttle_count) + cfs_rq->pelt_clock_throttled = 1; } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -6485,8 +6434,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -6513,7 +6460,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) * to fail. */ new = old * 2; - if (new < max_cfs_quota_period) { + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; cfs_b->burst *= 2; @@ -6547,7 +6494,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->period = us_to_ktime(default_bw_period_us()); cfs_b->burst = 0; cfs_b->hierarchical_quota = parent ? 
parent->hierarchical_quota : RUNTIME_INF; @@ -6568,6 +6515,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6603,7 +6551,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. */ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6615,7 +6562,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6728,28 +6674,37 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { return 0; } @@ -6772,7 +6727,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6817,7 +6772,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6826,9 +6781,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6870,9 +6824,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6881,12 +6832,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return 
sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -6935,13 +6884,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { @@ -6987,10 +6939,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -7012,18 +6960,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { - /* Account for idle runtime */ - if (!rq->nr_running) - dl_server_update_idle_time(rq, rq->curr); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); - } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -7045,14 +6985,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. @@ -7065,9 +7002,9 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -7101,9 +7038,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -7115,7 +7051,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. 
*/ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -7142,16 +7078,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); - /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; @@ -7181,7 +7113,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); @@ -7196,7 +7133,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) return true; } -#ifdef CONFIG_SMP +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); +} /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7357,8 +7297,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -7663,7 +7607,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7684,7 +7628,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8721,18 +8665,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} -#else -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8744,11 +8676,77 @@ static void set_next_buddy(struct sched_entity *se) } } +enum preempt_wakeup_action { + PREEMPT_WAKEUP_NONE, /* No preemption. */ + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ +}; + +static inline bool +set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + /* + * Keep existing buddy if the deadline is sooner than pse. 
+ * The older buddy may be cache cold and completely unrelated + * to the current wakeup but that is unpredictable where as + * obeying the deadline is more in line with EEVDF objectives. + */ + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) + return false; + + set_next_buddy(pse); + return true; +} + +/* + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not + * strictly enforced because the hint is either misunderstood or + * multiple tasks must be woken up. + */ +static inline enum preempt_wakeup_action +preempt_sync(struct rq *rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + u64 threshold, delta; + + /* + * WF_SYNC without WF_TTWU is not expected so warn if it happens even + * though it is likely harmless. + */ + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); + + threshold = sysctl_sched_migration_cost; + delta = rq_clock_task(rq) - se->exec_start; + if ((s64)delta < 0) + delta = 0; + + /* + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they + * could run on other CPUs. Reduce the threshold before preemption is + * allowed to an arbitrary lower value as it is more likely (but not + * guaranteed) the waker requires the wakee to finish. + */ + if (wake_flags & WF_RQ_SELECTED) + threshold >>= 2; + + /* + * As WF_SYNC is not strictly obeyed, allow some runtime for batch + * wakeups to be issued. + */ + if (entity_before(pse, se) && delta >= threshold) + return PREEMPT_WAKEUP_RESCHED; + + return PREEMPT_WAKEUP_NONE; +} + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); @@ -8763,13 +8761,9 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); - } - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -8801,7 +8795,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - cancel_protect_slice(se); + preempt_action = PREEMPT_WAKEUP_SHORT; goto preempt; } @@ -8819,42 +8813,94 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. - * - * Note that even if @p does not turn out to be the most eligible - * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) - cancel_protect_slice(se); + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto pick; + } + + /* + * Ignore wakee preemption on WF_FORK as it is less likely that + * there is shared data as exec often follow fork. Do not + * preempt for tasks that are sched_delayed as it would violate + * EEVDF to forcibly queue an ineligible task. 
+ */ + if ((wake_flags & WF_FORK) || pse->sched_delayed) + return; + + /* + * If @p potentially is completing work required by current then + * consider preemption. + * + * Reschedule if waker is no longer eligible. */ + if (in_task() && !entity_eligible(cfs_rq, se)) { + preempt_action = PREEMPT_WAKEUP_RESCHED; + goto preempt; + } + + /* Prefer picking wakee soon if appropriate. */ + if (sched_feat(NEXT_BUDDY) && + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { + + /* + * Decide whether to obey WF_SYNC hint for a new buddy. Old + * buddies are ignored as they may not be relevant to the + * waker and less likely to be cache hot. + */ + if (wake_flags & WF_SYNC) + preempt_action = preempt_sync(rq, wake_flags, pse, se); + } + + switch (preempt_action) { + case PREEMPT_WAKEUP_NONE: + return; + case PREEMPT_WAKEUP_RESCHED: + goto preempt; + case PREEMPT_WAKEUP_SHORT: + fallthrough; + case PREEMPT_WAKEUP_PICK: + break; + } +pick: /* * If @p has become the most eligible task, force preemption. */ - if (pick_eevdf(cfs_rq) == pse) + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) goto preempt; + if (sched_feat(RUN_TO_PARITY)) + update_protect_slice(cfs_rq, se); + return; preempt: + if (preempt_action == PREEMPT_WAKEUP_SHORT) + cancel_protect_slice(se); + resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -8862,7 +8908,10 @@ again: cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); @@ -8876,7 +8925,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8925,26 +8974,26 @@ again: return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; idle: - if (!rf) - return NULL; + if (rf) { + new_tasks = sched_balance_newidle(rq, rf); - new_tasks = sched_balance_newidle(rq, rf); - - /* - * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. - */ - if (new_tasks < 0) - return RETRY_TASK; + /* + * Because sched_balance_newidle() releases (and re-acquires) + * rq->lock, it is possible for any higher priority task to + * appear. In that case we must re-start the pick_next_entity() + * loop. 
+ */ + if (new_tasks < 0) + return RETRY_TASK; - if (new_tasks > 0) - goto again; + if (new_tasks > 0) + goto again; + } /* * rq is about to be idle, check if we need to update the @@ -8955,19 +9004,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) -{ - return pick_next_task_fair(rq, prev, NULL); -} - -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return !!dl_se->rq->cfs.nr_queued; -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -8976,7 +9016,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* @@ -8998,7 +9038,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t */ static void yield_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *curr = rq->donor; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -9022,15 +9062,26 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like se to run next. */ @@ -9041,7 +9092,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -9343,13 +9393,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9390,15 +9440,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or + * 2) target cfs_rq is in throttled hierarchy, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or - * 5) are cache-hot on their current CPU. 
+ * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (lb_throttled_hierarchy(p, env->dst_cpu)) return 0; /* @@ -9415,6 +9466,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; + if (task_is_blocked(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -9450,7 +9504,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_on_cpu(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p) || + task_current_donor(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -9494,6 +9549,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p->stats.nr_forced_migrations); } + WARN_ON(task_current(env->src_rq, p)); + WARN_ON(task_current_donor(env->src_rq, p)); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9758,12 +9816,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9872,7 +9930,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9889,7 +9947,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10034,9 +10092,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -10049,7 +10107,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. 
*/ @@ -10256,7 +10314,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group (sgs->group_weight - sgs->idle_cpus != 1)) return false; - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); } /* One group has more than one SMT CPU while the other group does not */ @@ -10493,7 +10551,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), + READ_ONCE(sg->asym_prefer_cpu)); case group_misfit_task: /* @@ -10601,7 +10660,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10611,7 +10670,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -10678,7 +10737,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -11730,6 +11789,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } /* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ @@ -11754,6 +11828,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11765,6 +11840,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -12005,6 +12088,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -12130,21 +12216,6 @@ out_unlock: } /* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. 
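The SD_SERIALIZE gate added to sched_balance_rq() above is a simple try-lock built from one atomic: contenders that lose the cmpxchg skip the pass instead of waiting, matching the "skipped and not re-tried" note. A standalone sketch of the same acquire/release shape, using C11 atomics purely for illustration:

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int balance_running;

	/* Returns true if this caller won the right to run the serialized pass. */
	static bool serialized_try_enter(void)
	{
		int zero = 0;

		/* acquire pairs with the release in serialized_exit() */
		return atomic_compare_exchange_strong_explicit(&balance_running,
							       &zero, 1,
							       memory_order_acquire,
							       memory_order_relaxed);
	}

	static void serialized_exit(void)
	{
		atomic_store_explicit(&balance_running, 0, memory_order_release);
	}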
- * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - -/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -12153,24 +12224,43 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) { + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) +{ + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. */ sd->max_newidle_lb_cost = cost; - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } @@ -12193,7 +12283,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -12202,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. 
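Two bits of bookkeeping are introduced above, and their magnitudes are worth spelling out (back-of-the-envelope only). The periodic decay multiplies the recorded maximum by 253/256 once per second, so a stale max_newidle_lb_cost halves in roughly a minute; newidle_ratio is an estimate, on a 0..1024 scale, of how often a newidle pass actually pulls a task, recomputed once newidle_call reaches 1024 and then halved along with the success count so that older samples age out:

\[
\mathrm{cost}(t + 1\,\mathrm{s}) = \tfrac{253}{256}\,\mathrm{cost}(t) \approx 0.988\,\mathrm{cost}(t),
\qquad t_{1/2} = \frac{\ln 2}{\ln(256/253)} \approx 59\ \mathrm{s}
\]
\[
\mathrm{newidle\_ratio} \approx 1024 \cdot \frac{\text{successful pulls}}{\text{newidle calls}}
\]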
*/ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12217,13 +12307,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -12237,9 +12320,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; @@ -12757,7 +12837,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12766,7 +12846,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -12818,18 +12898,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12845,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12852,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* @@ -12880,8 +12983,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? 
*/ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -12963,8 +13066,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13001,7 +13102,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) } /* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + * Consider any infeasible weight scenario. Take for instance two tasks, + * each bound to their respective sibling, one with weight 1 and one with + * weight 2. Then the lower weight task will run ahead of the higher weight + * task without bound. + * + * This utterly destroys the concept of a shared time base. + * + * Remember; all this is about a proportionally fair scheduling, where each + * tasks receives: + * + * w_i + * dt_i = ---------- dt (1) + * \Sum_j w_j + * + * which we do by tracking a virtual time, s_i: + * + * 1 + * s_i = --- d[t]_i (2) + * w_i + * + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. + * The immediate corollary is that the ideal schedule S, where (2) to use + * an infinitesimal delta, is: + * + * 1 + * S = ---------- dt (3) + * \Sum_i w_i + * + * From which we can define the lag, or deviation from the ideal, as: + * + * lag(i) = S - s_i (4) + * + * And since the one and only purpose is to approximate S, we get that: + * + * \Sum_i w_i lag(i) := 0 (5) + * + * If this were not so, we no longer converge to S, and we can no longer + * claim our scheduler has any of the properties we derive from S. This is + * exactly what you did above, you broke it! + * + * + * Let's continue for a while though; to see if there is anything useful to + * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: + * + * \Sum_i w_i s_i + * S = -------------- (6) + * \Sum_i w_i + * + * Which gives us a way to compute S, given our s_i. Now, if you've read + * our code, you know that we do not in fact do this, the reason for this + * is two-fold. Firstly, computing S in that way requires a 64bit division + * for every time we'd use it (see 12), and secondly, this only describes + * the steady-state, it doesn't handle dynamics. + * + * Anyway, in (6): s_i -> x + (s_i - x), to get: + * + * \Sum_i w_i (s_i - x) + * S - x = -------------------- (7) + * \Sum_i w_i + * + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very + * first counter-example. + * + * Luckily I think we can avoid needing a full multi-queue variant for + * core-scheduling (or load-balancing). The crucial observation is that we + * only actually need this comparison in the presence of forced-idle; only + * then do we need to tell if the stalled rq has higher priority over the + * other. 
+ * + * [XXX assumes SMT2; better consider the more general case, I suspect + * it'll work out because our comparison is always between 2 rqs and the + * answer is only interesting if one of them is forced-idle] + * + * And (under assumption of SMT2) when there is forced-idle, there is only + * a single queue, so everything works like normal. + * + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i + * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k + * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l + * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * + * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * + * + * Combined this gives the following: + * + * a) when a runqueue enters force-idle, sync it against it's sibling rq(s) + * using (7); this only requires storing single 'time'-stamps. + * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, + * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such + * boundaries, but I think we can avoid that. The force-idle is for the + * whole CPU, all it's rqs. So we can mark it in the root and lazily + * propagate downward on demand. + */ + +/* + * So this sync is basically a relative reset of S to 0. + * + * So with 2 queues, when one goes idle, we drop them both to 0 and one + * then increases due to not being idle, and the idle one builds up lag to + * get re-elected. So far so simple, right? + * + * When there's 3, we can have the situation where 2 run and one is idle, + * we sync to 0 and let the idle one build up lag to get re-election. Now + * suppose another one also drops idle. At this point dropping all to 0 + * again would destroy the built-up lag from the queue that was already + * idle, not good. + * + * So instead of syncing everything, we can: + * + * less := !((s64)(s_a - s_b) <= 0) + * + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b + * == v_a - (v_b - S_a + S_b) + * + * IOW, we can recast the (lag) comparison to a one-sided difference. + * So if then, instead of syncing the whole queue, sync the idle queue + * against the active queue with S_a + S_b at the point where we sync. + * + * (XXX consider the implication of living in a cyclic group: N / 2^n N) + * + * This gives us means of syncing single queues against the active queue, + * and for already idle queues to preserve their build-up lag. + * + * Of course, then we get the situation where there's 2 active and one + * going idle, who do we pick to sync against? Theory would have us sync + * against the combined S, but as we've already demonstrated, there is no + * such thing in infeasible weight scenarios. + * + * One thing I've considered; and this is where that core_active rudiment + * came from, is having active queues sync up between themselves after + * every tick. This limits the observed divergence due to the work + * conservancy. + * + * On top of that, we can improve upon things by employing (10) here. + */ + +/* + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. 
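For readability, the per-runqueue aggregates and the one-sided comparison sketched in the two comment blocks above can be restated compactly (same definitions as (8)-(11), nothing new is introduced):

\[
T_k = \sum_{i \in k} w_i s_i,\qquad W_k = \sum_{i \in k} w_i,\qquad
S_k = \frac{T_k}{W_k},\qquad S_{k+l} = \frac{T_k + T_l}{W_k + W_l}
\]
\[
\mathrm{lag}_{k+l}(i) = S_{k+l} - s_i,\qquad
(v_a - S_a) - (v_b - S_b) = v_a - \bigl(v_b - S_a + S_b\bigr)
\]

which is exactly the quantity cfs_prio_less() computes below from the two vruntimes and the two zero_vruntime_fi snapshots.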
*/ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) @@ -13015,7 +13279,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, cfs_rq->forceidle_seq = fi_seq; } - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; } } @@ -13061,18 +13325,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's - * min_vruntime_fi, which would have been updated in prior calls + * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13088,9 +13352,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. @@ -13134,11 +13398,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13150,8 +13417,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13163,10 +13431,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. + */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -13177,22 +13448,20 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } + + assert_list_leaf_cfs_rq(rq_of(cfs_rq)); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). 
@@ -13201,7 +13470,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13235,6 +13503,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13265,7 +13539,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13273,7 +13546,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13310,10 +13582,8 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13328,10 +13598,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13615,6 +13883,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13623,12 +13893,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13637,13 +13905,13 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, @@ -13700,7 +13968,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13722,6 +13989,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
*/ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, true) /* * Allow completely ignoring cfs_rq->next; which can be set from various @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f..c174afe1dd17 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include <linux/cpuidle.h> +#include <linux/suspend.h> +#include <linux/livepatch.h> +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -126,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -200,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } @@ -427,7 +434,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. 
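To make the NI_RANDOM feature concrete (a worked example with assumed numbers): suppose roughly 256 of the last 1024 newidle attempts in a domain pulled a task, so sd->newidle_ratio is about 256. The dice in sched_balance_newidle() then passes about a quarter of the time, and each successful pull is credited about 4x in update_newidle_stats(), roughly compensating for the skipped attempts:

\[
\mathrm{weight} = 1 + 256 = 257,\qquad
P(\mathrm{attempt}) \approx \frac{258}{1024} \approx 25\%,\qquad
\mathrm{credit} = \left\lfloor \frac{1024 + 257/2}{257} \right\rfloor = 4
\]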
*/ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -439,7 +445,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* * Idle tasks are unconditionally rescheduled: @@ -449,9 +454,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } +static void update_curr_idle(struct rq *rq); + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - dl_server_update_idle_time(rq, prev); + update_curr_idle(rq); scx_update_idle(rq, false, true); } @@ -463,7 +470,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -493,21 +500,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { + update_curr_idle(rq); } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } static void update_curr_idle(struct rq *rq) { + struct sched_entity *se = &rq->idle->se; + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - se->exec_start; + if (unlikely(delta_exec <= 0)) + return; + + se->exec_start = now; + + dl_server_update_idle(&rq->fair_server, delta_exec); } /* @@ -515,6 +537,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -526,15 +550,13 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81bc8b329ef1..3ad0d6df6a0a 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include <linux/sched/isolation.h> +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), @@ -40,7 +42,7 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); if (likely(cpu < nr_cpu_ids)) return cpu; /* @@ -165,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags) } } + /* + * Check the combination of nohz_full and isolcpus=domain, + * necessary to avoid problems with the timer migration + * hierarchy. managed_irq is ignored by this check since it + * isn't considered in the timer migration logic. 
+ */ + iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + type = find_first_bit(&iter_flags, HK_TYPE_MAX); + /* + * Pass the check if none of these flags were previously set or + * are not in the current selection. + */ + iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 : + cpumask_first_and_and(cpu_present_mask, + housekeeping_staging, housekeeping.cpumasks[type]); + if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) { + pr_warn("Housekeeping: must include one present CPU " + "neither in nohz_full= nor in isolcpus=domain, " + "ignoring setting %s\n", str); + goto free_housekeeping_staging; + } + iter_flags = flags & ~housekeeping.flags; for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..b601e0243d0e 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ +#include <linux/sched/nohz.h> +#include "sched.h" /* * Global load-average calculations @@ -80,7 +82,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (int)this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -333,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include <uapi/linux/membarrier.h> +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering @@ -197,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -405,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. 
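As an example of the configuration the new housekeeping check above rejects (hypothetical 8-CPU machine, assumed boot parameters):

	nohz_full=1-7 isolcpus=domain,0

	KERNEL_NOISE housekeeping: CPU 0 only
	DOMAIN housekeeping:       CPUs 1-7
	present intersection:      empty

No present CPU is housekeeping for both flags, so whichever of the two parameters is parsed second is ignored with the pr_warn() shown above.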
*/ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..fa83bbaf4f3e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot <vincent.guittot@linaro.org> */ +#include "pelt.h" /* * Approximate: @@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66..f921302dc40f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,4 +1,8 @@ -#ifdef CONFIG_SMP +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -158,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -169,68 +173,17 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif - -#else - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif - +#endif /* !CONFIG_CFS_BANDWIDTH */ +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c 
b/kernel/sched/psi.c index 1396674fa722..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include <linux/sched/clock.h> +#include <linux/workqueue.h> +#include <linux/psi.h> +#include "sched.h" static int psi_bug __read_mostly; @@ -172,17 +176,35 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - group->enabled = true; - for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -262,14 +284,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -733,7 +755,7 @@ static int psi_rtpoll_worker(void *data) static void poll_timer_fn(struct timer_list *t) { - struct psi_group *group = from_timer(group, t, rtpoll_timer); + struct psi_group *group = timer_container_of(group, t, rtpoll_timer); atomic_set(&group->rtpoll_wakeup, 1); wake_up_interruptible(&group->rtpoll_wait); @@ -768,31 +790,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. - * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in * the state mask. 
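With the per-CPU psi_seq introduced above, readers keep the usual seqcount retry loop but now validate against the CPU-wide sequence instead of a per-group one, which is what lets psi_group_change() drop its own write_seqcount_begin()/end() pair. A minimal reader sketch built from the helpers defined above (illustrative, mirroring get_recent_times()):

	/* Illustrative only: take a consistent snapshot of one group's times. */
	static void snapshot_group_times(struct psi_group *group, int cpu,
					 u32 times[NR_PSI_STATES])
	{
		struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
		u32 seq;

		do {
			seq = psi_read_begin(cpu);	/* per CPU, not per group */
			memcpy(times, groupc->times, sizeof(groupc->times));
		} while (psi_read_retry(cpu, seq));	/* writer raced: retry */
	}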
Clear, set, or carry the current state if @@ -843,7 +855,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -864,8 +875,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -900,24 +909,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -926,16 +940,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -968,12 +981,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. 
If there are @@ -983,20 +995,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; @@ -1005,8 +1018,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1015,27 +1027,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1221,12 +1230,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ @@ -1651,7 +1662,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 000000000000..a23747bbe25b --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include <linux/kbuild.h> +#include <linux/types.h> +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index fa03ec3ed56a..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. 
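The "more than 4 hours" in the comment above is just the width of the bandwidth fixed point: assuming the usual definitions from sched.h (BW_SHIFT = 20 and MAX_BW = 2^(64 - BW_SHIFT) - 1, in nanoseconds), the cap works out to

\[
\mathrm{MAX\_BW} = 2^{44} - 1\ \mathrm{ns} \approx 1.76 \times 10^{13}\ \mathrm{ns}
\approx 17592\ \mathrm{s} \approx 4.9\ \mathrm{h}
\]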
*/ static const u64 max_rt_runtime = MAX_BW; @@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -89,6 +90,7 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; #endif } @@ -175,11 +177,14 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } @@ -187,11 +192,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_se->rt_rq; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } void unregister_rt_sched_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return; + if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); } @@ -200,6 +209,9 @@ void free_rt_sched_group(struct task_group *tg) { int i; + if (!rt_group_sched_enabled()) + return; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -244,6 +256,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; + if (!rt_group_sched_enabled()) + return 1; + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -277,7 +292,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -313,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -416,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -471,20 +469,17 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -497,6 
+492,11 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); + return NULL; + } + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -509,9 +509,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -578,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -609,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. */ @@ -782,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -914,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -961,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1017,7 +1004,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1059,20 +1046,17 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -1082,27 +1066,16 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. 
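The #ifdef CONFIG_RT_GROUP_SCHED guards removed above are replaced by IS_ENABLED(), which expands to a compile-time 0 or 1: with the option disabled the extra check folds to "if (0 && ...)" and is eliminated, but unlike #ifdef the expression stays visible to the compiler and keeps being type-checked. The general shape of the pattern (restating the hunk above, nothing new):

	/* Folds away entirely when CONFIG_RT_GROUP_SCHED is not set. */
	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
		return;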
*/ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1141,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1156,8 +1122,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void @@ -1169,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1179,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1257,11 +1222,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr static inline struct sched_statistics * __schedstats_from_rt_se(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_RT_GROUP_SCHED /* schedstats is not supported for rt group. 
*/ if (!rt_entity_is_task(rt_se)) return NULL; -#endif return &rt_task_of(rt_se)->stats; } @@ -1477,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1524,10 +1490,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1642,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1656,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ -1671,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1733,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -1757,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active @@ -1765,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -1883,6 +1845,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1913,18 +1896,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. 
*/ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1944,27 +1925,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(task_current_donor(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -2442,7 +2402,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -2466,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2480,13 +2437,15 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2500,11 +2459,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2540,9 +2494,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. 
@@ -2602,18 +2556,21 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; -#ifdef CONFIG_RT_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); #else rt_rq = &cpu_rq(cpu)->rt; #endif return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2624,7 +2581,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2633,14 +2589,13 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, @@ -2713,6 +2668,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + total = to_ratio(period, runtime); /* @@ -2868,13 +2826,13 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2882,7 +2840,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) @@ -2938,6 +2896,12 @@ undo: sched_domains_mutex_unlock(); mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + return ret; } diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed11..6803cfec7a1e 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ +#include <linux/types.h> static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47972f34ea70..d30cca6870f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -20,7 +21,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -69,6 +69,7 @@ #include <linux/wait_bit.h> #include <linux/workqueue_api.h> #include <linux/delayacct.h> +#include <linux/mmu_context.h> #include <trace/events/power.h> #include <trace/events/sched.h> @@ -364,29 +365,54 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * * dl_se::rq -- runqueue we belong to. * - * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the - * server when it runs out of tasks to run. - * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. + * + * Only when the target class does not manage to exhaust the server's runtime + * (there's actually starvation in the given period), will the dl_server get on + * the runqueue. Once queued it will pick tasks from the target class and run + * them until either its runtime is exhausted, at which point it's back to + * dl_server_timer, or until there are no more tasks to run, at which point + * the dl_server stops itself. + * + * By stopping at this point the dl_server retains bandwidth, which, if a new + * task wakes up imminently (starting the server again), can be used -- + * subject to CBS wakeup rules -- without having to wait for the next period. + * + * Additionally, because of the dl_defer behaviour the start/stop behaviour is + * naturally throttled to once per period, avoiding high context switch + * workloads from spamming the hrtimer program/cancel paths.
*/ +extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); +extern void sched_init_dl_servers(void); -extern void dl_server_update_idle_time(struct rq *rq, - struct task_struct *p); extern void fair_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, @@ -401,6 +427,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH +extern const u64 max_bw_quota_period_us; + +/* + * default period for group bandwidth. + * default: 0.1s, units: microseconds + */ +static inline u64 default_bw_period_us(void) +{ + return 100000ULL; +} +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; @@ -424,7 +463,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -442,15 +481,13 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -459,10 +496,7 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif -#ifdef CONFIG_EXT_GROUP_SCHED - u32 scx_flags; /* SCX_TG_* */ - u32 scx_weight; -#endif + struct scx_task_group scx; struct rcu_head rcu; struct list_head list; @@ -531,7 +565,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -539,7 +573,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -573,25 +607,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static 
inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ -652,10 +681,10 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 min_vruntime; + u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; - u64 min_vruntime_fi; + u64 zero_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -667,7 +696,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -699,7 +727,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -732,10 +759,12 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + bool throttled:1; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -750,10 +779,10 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -774,10 +803,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; - unsigned long pnt_seq; + unsigned long kick_sync; + local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; + struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -796,32 +827,28 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; unsigned int rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; /* this is always top-level rq, cache? 
*/ +#endif +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; @@ -837,7 +864,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -857,9 +883,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. */ struct rb_root_cached pushable_dl_tasks_root; -#else - struct dl_bw dl_bw; -#endif + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -930,7 +954,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. */ @@ -1006,7 +1029,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1041,7 +1064,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1098,6 +1120,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1105,18 +1129,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1147,12 +1167,17 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. 
Always updated under the runqueue lock: */ - unsigned int nr_uninterruptible; + unsigned long nr_uninterruptible; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else union { struct task_struct __rcu *donor; /* Scheduler context */ struct task_struct __rcu *curr; /* Execution context */ }; +#endif struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1181,7 +1206,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1222,7 +1246,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1240,9 +1263,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif @@ -1269,9 +1290,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1292,12 +1311,12 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1311,35 +1330,33 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) @@ -1347,10 +1364,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + rcu_assign_pointer(rq->donor, t); +} +#else static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { /* Do nothing */ } +#endif #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1416,6 +1440,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) if (!sched_core_enabled(rq)) return true; + if (rq->core->core_cookie == p->core_cookie) + return true; + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { if (!available_idle_cpu(cpu)) { idle_core = false; @@ -1427,7 +1454,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) * A CPU in an idle core is always the best choice for tasks with * cookies. 
*/ - return idle_core || rq->core->core_cookie == p->core_cookie; + return idle_core; } static inline bool sched_group_cookie_match(struct rq *rq, @@ -1499,6 +1526,24 @@ static inline bool sched_group_cookie_match(struct rq *rq, #endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ +# define rt_group_sched_enabled() false +#endif /* !CONFIG_RT_GROUP_SCHED */ + static inline void lockdep_assert_rq_held(struct rq *rq) { lockdep_assert_held(__rq_lockp(rq)); @@ -1555,9 +1600,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1717,10 +1762,10 @@ extern struct balance_callback balance_push_callback; #ifdef CONFIG_SCHED_CLASS_EXT extern const struct sched_class ext_sched_class; -DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ -#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_enabled() static_branch_unlikely(&__scx_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) static inline void scx_rq_clock_update(struct rq *rq, u64 clock) @@ -1738,7 +1783,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -1762,9 +1807,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1795,7 +1838,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1807,8 +1851,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1817,6 +1860,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) 
__acquires(rq->lock) { @@ -1931,19 +1979,17 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2109,8 +2155,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2146,9 +2190,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2174,7 +2225,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif + rseq_sched_set_ids_changed(p); +#endif /* CONFIG_SMP */ } /* @@ -2252,13 +2304,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) return rq->donor == p; } +static inline bool task_is_blocked(struct task_struct *p) +{ + if (!sched_proxy_exec()) + return false; + + return !!p->blocked_on; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) @@ -2281,11 +2337,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2305,8 +2359,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2319,37 +2372,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. 
+ * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2366,16 +2428,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct 
*(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2385,57 +2492,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
+ * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2461,7 +2634,7 @@ static inline void put_prev_set_next_task(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); __put_prev_set_next_dl_server(rq, prev, next); @@ -2547,16 +2720,15 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2580,7 +2752,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. 
*/ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -2608,26 +2780,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2677,6 +2829,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 @@ -2723,10 +2877,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } @@ -2892,10 +3044,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ @@ -2904,8 +3053,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -2928,7 +3075,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. */ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3065,42 +3212,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. 
- */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3119,6 +3230,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3158,7 +3270,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3228,14 +3340,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3284,8 +3396,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3329,10 +3439,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3509,15 +3615,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -extern struct cpufreq_governor schedutil_gov; - -#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3543,7 +3647,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3553,7 +3657,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3564,7 +3667,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3580,290 +3682,216 @@ extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; +static __always_inline bool cid_in_transit(unsigned int cid) +{ + return cid & MM_CID_TRANSIT; +} -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} -static inline void __mm_cid_put(struct mm_struct *mm, int cid) +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) { - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); + return cid | MM_CID_ONCPU; } -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
- */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) { - struct mm_struct *mm = t->mm; - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + return cid | MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) +{ + return cid & ~MM_CID_TRANSIT; } -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +static __always_inline bool cid_on_task(unsigned int cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, res; + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ + return cid < MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res == cid) - break; - cid = res; - } - return cid; +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) +{ + clear_bit(cid, mm_cidmask(mm)); } -static inline void mm_cid_put(struct mm_struct *mm) +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) { - int cid; + unsigned int cid = t->mm_cid.cid; - lockdep_assert_irqs_disabled(); - cid = mm_cid_pcpu_unset(mm); - if (cid == MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + t->mm_cid.cid = MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); } -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { - struct cpumask *cidmask = mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid = atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { - max_nr_cid = allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid = __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. Expanding cid allocation as much as - * possible improves cache locality. - */ - cid = max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. 
*/ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. - * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. - */ - for (;;) { - cid = cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) - break; - cpu_relax(); - } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) +{ + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); + if (cid >= max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; return cid; } -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was used. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +static inline unsigned int mm_get_cid(struct mm_struct *mm) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); + while (cid == MM_CID_UNSET) { + cpu_relax(); + cid = __mm_get_cid(mm, num_possible_cpus()); + } + return cid; } -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, + unsigned int max_cids) { - int cid; + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto unlock; + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. */ + new_cid = __mm_get_cid(mm, max_cids); + if (new_cid != MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); } + return orig_cid; +} - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); - /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. - */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid = __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. 
- */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) +{ + if (t->mm_cid.cid != cid) { + t->mm_cid.cid = cid; + rseq_sched_set_ids_changed(t); + } +} - return cid; +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); } -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - struct cpumask *cpumask; - int cid; + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; - lockdep_assert_rq_held(rq); - cpumask = mm_cidmask(mm); - cid = __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid = cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); } - cid = __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); } -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) { - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. 
- */ - smp_mb__after_switch_mm(); + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; } + /* Try to converge into the optimal CID space */ + tcid = mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid = cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid = mm_get_cid(mm); + /* Set the transition mode flag if required */ + tcid |= READ_ONCE(mm->mm_cid.transit); } - if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid = -1; - } - if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm = next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + +static __always_inline void mm_cid_schedout(struct task_struct *prev) +{ + /* During mode transitions CIDs are temporary and need to be dropped */ + if (likely(!cid_in_transit(prev->mm_cid.cid))) + return; + + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); + prev->mm_cid.cid = MM_CID_UNSET; +} + +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) +{ + mm_cid_schedout(prev); + mm_cid_schedin(next); } #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } -static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3884,7 +3912,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif #ifdef CONFIG_RT_MUTEXES @@ -3919,45 +3946,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct 
balance_callback *head); -#else - -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif +/* + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. + */ -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. */ -struct sched_enq_and_set_ctx { +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb0..7f151d96dba9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include <linux/types.h> extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31f..d1c9429a4ac5 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) @@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1..c903f1a42891 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 
@@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags) * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. + * + * In the SCHED_PROXY_EXECUTION case we may do sleeping + * dequeues that are not followed by a task switch, so check + * TSK_ONCPU is set to ensure the task switch is imminent. + * Otherwise clear the flags as usual. */ - if (flags & DEQUEUE_SLEEP) + if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU)) return; /* @@ -206,7 +211,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } @@ -220,7 +225,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +234,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +339,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,8 +7,8 @@ * * See kernel/stop_machine.c */ +#include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -33,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -76,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this 
class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -96,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -106,15 +110,13 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a..0fef6496c4c8 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index c326de1344fb..0496dc29ed0f 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -174,7 +153,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. @@ -209,10 +188,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -255,8 +232,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. 
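For illustration only (not part of the patch): the scoped_guard(sched_change, ...) conversions in this file are easier to read next to the begin/end form that the DEFINE_CLASS(sched_change, ...) declaration expands to. A sketch of the equivalence for the set_user_nice() hunk above; the bodies of sched_change_begin()/sched_change_end() are not shown in this diff, but per the removed open-coded lines they presumably cover the dequeue/put_prev and enqueue/set_next/prio_changed legs:

	/* scoped_guard (sched_change, p, DEQUEUE_SAVE) { ... } unrolls to roughly: */
	struct sched_change_ctx *ctx = sched_change_begin(p, DEQUEUE_SAVE);

	/* the scheduler cannot observe p while it is dequeued */
	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p, true);
	p->prio = effective_prio(p);

	sched_change_end(ctx);		/* re-enqueue and fire the class callbacks */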
@@ -448,7 +424,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -518,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -634,14 +610,14 @@ change: * Do not allow real-time tasks into groups that have no runtime * assigned. */ - if (rt_bandwidth_enabled() && rt_policy(policy) && + if (rt_group_sched_enabled() && + rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { retval = -EPERM; goto unlock; } -#endif -#ifdef CONFIG_SMP +#endif /* CONFIG_RT_GROUP_SCHED */ if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -657,7 +633,6 @@ change: goto unlock; } } -#endif } /* Re-check policy now with rq lock held: */ @@ -699,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); @@ -860,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p) } EXPORT_SYMBOL_GPL(sched_set_fifo_low); +/* + * Used when the primary interrupt handler is forced into a thread, in addition + * to the (always threaded) secondary handler. The secondary handler gets a + * slightly lower priority so that the primary handler can preempt it, thereby + * emulating the behavior of a non-PREEMPT_RT system where the primary handler + * runs in hard interrupt context. 
+ */ +void sched_set_fifo_secondary(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} + void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { @@ -1119,7 +1096,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1148,7 +1124,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { @@ -1241,7 +1216,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } @@ -1357,7 +1332,7 @@ static void do_sched_yield(void) rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + rq->donor->sched_class->yield_task(rq); preempt_disable(); rq_unlock_irq(rq, &rf); @@ -1426,12 +1401,13 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { - struct task_struct *curr = current; + struct task_struct *curr; struct rq *rq, *p_rq; int yielded = 0; scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); + curr = rq->donor; again: p_rq = task_rq(p); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1ebc60d967f..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include <linux/sched/isolation.h> #include <linux/bsearch.h> +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) @@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -212,8 +214,6 @@ static bool sched_energy_update; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; int i; /* EAS is enabled for asymmetric CPU capacity topologies. */ @@ -248,25 +248,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask) return false; } - /* Do not attempt EAS if schedutil is not being used. 
*/ - for_each_cpu(i, cpu_mask) { - policy = cpufreq_cpu_get(i); - if (!policy) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", - cpumask_pr_args(cpu_mask), i); - } - return false; - } - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_mask)); - } - return false; + if (!cpufreq_ready_for_eas(cpu_mask)) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", + cpumask_pr_args(cpu_mask)); } + return false; } return true; @@ -328,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -464,9 +451,9 @@ free: return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1333,6 +1320,60 @@ next: update_group_capacity(sd, cpu); } +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. + */ + WARN_ON_ONCE(sd->flags & SD_NUMA); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at a + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -1549,13 +1590,19 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; -static int sched_domains_curr_level; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. 
@@ -1591,14 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1629,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, @@ -1636,7 +1682,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1671,7 +1717,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } @@ -1691,22 +1737,63 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. 
*/ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1727,10 +1814,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -1767,10 +1858,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1846,14 +1937,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. 
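For illustration only (not part of the patch): the SDTL_INIT() conversions replace open-coded designated initializers. Judging from the initializer form still visible in the removed NUMA-level lines further down, the macro presumably amounts to the hypothetical reconstruction below; the real definition lives in a header outside this diff:

/* hypothetical reconstruction of SDTL_INIT(maskfn, flagsfn, dname) */
#define SDTL_INIT(maskfn, flagsfn, dname)		\
	((struct sched_domain_topology_level){		\
		.mask		= maskfn,		\
		.sd_flags	= flagsfn,		\
		SD_INIT_NAME(dname)			\
	})

/*
 * So the converted PKG entry, SDTL_INIT(tl_pkg_mask, NULL, PKG), replaces the
 * old { cpu_cpu_mask, SD_INIT_NAME(PKG) } initializer, with the mask callbacks
 * now taking the topology level itself as their first argument (see the
 * tl_*_mask() helpers added above).
 */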
+ */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1861,17 +1972,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1884,18 +1994,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; + + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; - bitmap_free(distance_map); + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1913,6 +2051,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. 
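For illustration only (not part of the patch): arch_sched_node_distance() above is a __weak alias of numa_node_dist(), so an architecture overrides it simply by providing a strong definition. A hypothetical override, not taken from any in-tree architecture, that collapses SLIT distances into two buckets so fewer NUMA sched-domain levels get built while node_distance() itself stays untouched:

/* hypothetical arch override of the weak default */
int arch_sched_node_distance(int from, int to)
{
	/* two distance values -> at most two NUMA levels for sched domains */
	return node_distance(from, to) <= LOCAL_DISTANCE ? LOCAL_DISTANCE
							 : REMOTE_DISTANCE;
}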
*/ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -1938,10 +2078,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -1967,30 +2110,20 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; } sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -1998,14 +2131,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2014,6 +2151,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2060,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } @@ -2098,7 +2237,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; @@ -2169,6 +2308,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) goto unlock; hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); + if (!hop_masks) + goto unlock; hop = hop_masks - k.masks; ret = hop ? 
@@ -2294,7 +2435,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2347,35 +2488,58 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i = cpu + 1; + struct sched_domain_topology_level *tl; + struct cpumask *covered, *id_seen; + int cpu; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + id_seen = sched_domains_tmpmask2; + + for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); + + /* NUMA levels are allowed to overlap */ + if (tl_common_flags & SD_NUMA) + continue; + + cpumask_clear(covered); + cpumask_clear(id_seen); - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu_from(i, cpu_map) { /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. 
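For illustration only (not part of the patch): the rewritten topology_span_sane() in this hunk enforces one invariant per non-NUMA level, namely that any two CPUs' masks are either identical or fully disjoint, using the lowest CPU of each mask as the span's id. A self-contained userspace C analogue of that check on toy 64-bit masks, with invented names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* every per-CPU mask must be identical to, or disjoint from, every other */
static bool spans_sane(const uint64_t *mask_of, int nr_cpus)
{
	uint64_t covered = 0, id_seen = 0;

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		uint64_t m = mask_of[cpu];
		int id = __builtin_ctzll(m);		/* lowest CPU acts as the span id */

		if (id_seen & (1ULL << id)) {
			if (mask_of[id] != m)		/* id already seen: spans must match */
				return false;
		} else {
			if (covered & m)		/* new span: must not overlap old ones */
				return false;
			covered |= m;
			id_seen |= 1ULL << id;
		}
	}
	return true;
}

int main(void)
{
	uint64_t ok[4]  = { 0x3, 0x3, 0xc, 0xc };	/* {0,1} {0,1} {2,3} {2,3} */
	uint64_t bad[4] = { 0x3, 0x3, 0x6, 0x6 };	/* {1,2} partially overlaps {0,1} */

	printf("%d %d\n", spans_sane(ok, 4), spans_sane(bad, 4));	/* prints "1 0" */
	return 0;
}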
*/ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); + int id; + + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl_cpu_mask); + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; + + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); + } + } + } return true; } @@ -2408,27 +2572,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } } + if (WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..20f27e2cf7ae 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -40,13 +41,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ { unsigned long flags; - wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + wq_entry->flags |= WQ_FLAG_PRIORITY; spin_lock_irqsave(&wq_head->lock, flags); __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry) +{ + struct list_head *head = &wq_head->head; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + + guard(spinlock_irqsave)(&wq_head->lock); + + if (!list_empty(head) && + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) + return -EBUSY; + + list_add(&wq_entry->entry, head); + return 0; +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -64,7 +83,7 @@ EXPORT_SYMBOL(remove_wait_queue); * the non-exclusive tasks. Normally, exclusive tasks will be at the end of * the list and any non-exclusive tasks will be woken first. A priority task * may be at the head of the list, and can consume the event without any other - * tasks being woken. + * tasks being woken if it's also an exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec95..1088d3b7012c 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/scs.c b/kernel/scs.c index d7809affe740..772488afd5b9 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk) if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) return; - for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; used += sizeof(*p); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738..25f62867a16d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ out: } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = { #ifdef __NR_uretprobe __NR_uretprobe, #endif +#ifdef __NR_uprobe + __NR_uprobe, +#endif -1, /* negative terminated */ }; @@ -1139,7 +1157,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { - return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; + return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, @@ -1186,13 +1204,11 @@ static int seccomp_do_user_notification(int this_syscall, if (err != 0) { /* - * Check to see if the notifcation got picked up and - * whether we should switch to wait killable. + * Check to see whether we should switch to wait + * killable. Only return the interrupted error if not. */ - if (!wait_killable && should_sleep_killable(match, &n)) - continue; - - goto interrupted; + if (!(!wait_killable && should_sleep_killable(match, &n))) + goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, diff --git a/kernel/signal.c b/kernel/signal.c index f8859faa26c5..e42b8bd6922f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3016,7 +3016,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. 
*/ - do_coredump(&ksig->info); + vfs_coredump(&ksig->info); } /* @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); @@ -4067,6 +4065,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, { struct pid *pid; enum pid_type type; + int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) @@ -4108,7 +4107,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, } } - return do_pidfd_send_signal(pid, sig, type, info, flags); + ret = do_pidfd_send_signal(pid, sig, type, info, flags); + put_pid(pid); + + return ret; } static int @@ -4981,9 +4983,20 @@ static const struct ctl_table signal_debug_table[] = { #endif }; +static const struct ctl_table signal_table[] = { + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); + register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); diff --git a/kernel/smp.c b/kernel/smp.c index 974f3a3962e8..f349960f79ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -86,13 +86,15 @@ int smpcfd_dead_cpu(unsigned int cpu) int smpcfd_dying_cpu(unsigned int cpu) { /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. + * The IPIs for the smp-call-function callbacks queued by other CPUs + * might arrive late, either due to hardware latencies or because this + * CPU disabled interrupts (inside stop-machine) before the IPIs were + * sent. So flush out any pending callbacks explicitly (without waiting + * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go + * offline with work still pending. + * + * This runs with interrupts disabled inside the stopper task invoked by + * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. */ __flush_smp_call_function_queue(false); irq_work_run(); @@ -418,6 +420,10 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) */ static int generic_exec_single(int cpu, call_single_data_t *csd) { + /* + * Preemption already disabled here so stopper cannot run on this CPU, + * ensuring mutually exclusive CPU offlining and last IPI flush. + */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; @@ -638,8 +644,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int err; /* - * prevent preemption and reschedule on another processor, - * as well as CPU removal + * Prevent preemption and reschedule on another CPU, as well as CPU + * removal. 
This prevents stopper from running on this CPU, thus + * providing mutual exclusion of the below cpu_online() check and + * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); @@ -741,32 +749,19 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async); * * Selection preference: * 1) current cpu if in @mask - * 2) any cpu of current node if in @mask - * 3) any other online cpu in @mask + * 2) nearest cpu in @mask, based on NUMA topology */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; - const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); - if (cpumask_test_cpu(cpu, mask)) - goto call; - - /* Try for same node. */ - nodemask = cpumask_of_node(cpu_to_node(cpu)); - for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; - cpu = cpumask_next_and(cpu, nodemask, mask)) { - if (cpu_online(cpu)) - goto call; - } + if (!cpumask_test_cpu(cpu, mask)) + cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); - /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ - cpu = cpumask_any_and(mask, cpu_online_mask); -call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; @@ -792,7 +787,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask, bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; - bool run_local = false; lockdep_assert_preemption_disabled(); @@ -814,19 +808,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, */ WARN_ON_ONCE(!in_task()); - /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && - (!cond_func || cond_func(this_cpu, info))) - run_local = true; - /* Check if we need remote execution, i.e., any CPU excluding this one. */ - cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == this_cpu) - cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (cpu < nr_cpu_ids) - run_remote = true; - - if (run_remote) { + if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); @@ -840,6 +823,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, continue; } + /* Work is enqueued on a remote CPU. */ + run_remote = true; + csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; @@ -851,6 +837,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask, #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + /* + * Kick the remote CPU if this is the first work + * item enqueued. + */ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; @@ -869,7 +859,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local) { + /* Check if we need local execution. */ + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); @@ -892,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: Bitmask that controls the operation. 
If %SCF_WAIT is set, wait - * (atomically) until function has completed on other CPUs. If - * %SCF_RUN_LOCAL is set, the function will also be run locally - * if the local CPU is set in the @cpumask. - * - * If @wait is true, then returns once @func has returned. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. + * + * @func is not called on the local CPU even if @mask contains it. Consider + * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -1026,7 +1017,7 @@ void __init smp_init(void) * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should - * return a blooean value indicating whether to IPI + * return a boolean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. @@ -1097,6 +1088,28 @@ void wake_up_all_idle_cpus(void) EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** + * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs + * @mask: The CPU mask for the CPUs to check. + * + * This function walks through the @mask to check if there are any pending IPIs + * scheduled, for any of the CPUs in the @mask. It does not guarantee + * correctness as it only provides a racy snapshot. + * + * Returns true if there is a pending IPI scheduled and false otherwise. + */ +bool cpus_peek_for_pending_ipi(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu))) + return true; + } + + return false; +} + +/** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1992b62e980b..4503b60ce9bd 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -18,8 +18,6 @@ #include "smpboot.h" -#ifdef CONFIG_SMP - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -76,8 +74,6 @@ void __init idle_threads_init(void) } #endif -#endif /* #ifdef CONFIG_SMP */ - static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); diff --git a/kernel/softirq.c b/kernel/softirq.c index 513b1945987c..77198911b8dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { - local_lock(&softirq_ctrl.lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + local_lock(&softirq_ctrl.lock); + else + migrate_disable(); + /* Required to meet the RCU bottomhalf requirements. */ rcu_read_lock(); } else { @@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) * Track the per CPU softirq disabled state. On RT this is per CPU * state to allow preemption of bottom half disabled sections. */ - newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); - /* - * Reflect the result in the task state to prevent recursion on the - * local lock and to make softirq_count() & al work. 
- */ - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. + */ + current->softirq_disable_cnt = newcnt; - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { - raw_local_irq_save(flags); - lockdep_softirqs_off(ip); - raw_local_irq_restore(flags); + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } + } else { + bool sirq_dis = false; + + if (!current->softirq_disable_cnt) + sirq_dis = true; + + this_cpu_add(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt += cnt; + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt, bool unlock) { unsigned long flags; + bool sirq_en = false; int newcnt; - DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != - this_cpu_read(softirq_ctrl.cnt)); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + if (softirq_count() == cnt) + sirq_en = true; + } else { + if (current->softirq_disable_cnt == cnt) + sirq_en = true; + } - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) { raw_local_irq_save(flags); lockdep_softirqs_on(_RET_IP_); raw_local_irq_restore(flags); } - newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; - if (!newcnt && unlock) { - rcu_read_unlock(); - local_unlock(&softirq_ctrl.lock); + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } + } else { + current->softirq_disable_cnt -= cnt; + this_cpu_sub(softirq_ctrl.cnt, cnt); + if (unlock && !current->softirq_disable_cnt) { + migrate_enable(); + rcu_read_unlock(); + } else { + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + } } } @@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) lock_map_release(&bh_lock_map); local_irq_save(flags); - curcnt = __this_cpu_read(softirq_ctrl.cnt); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + curcnt = this_cpu_read(softirq_ctrl.cnt); + else + curcnt = current->softirq_disable_cnt; /* * If this is not reenabling soft interrupts, no point in trying to @@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } +#ifdef CONFIG_PREEMPT_RT +struct tasklet_sync_callback { + spinlock_t cb_lock; + atomic_t cb_waiters; +}; + +static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = { + .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock), + .cb_waiters = ATOMIC_INIT(0), +}; + +static void tasklet_lock_callback(void) +{ + spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_unlock_callback(void) +{ + spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_callback_cancel_wait_running(void) +{ + struct 
tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + atomic_inc(&sync_cb->cb_waiters); + spin_lock(&sync_cb->cb_lock); + atomic_dec(&sync_cb->cb_waiters); + spin_unlock(&sync_cb->cb_lock); +} + +static void tasklet_callback_sync_wait_running(void) +{ + struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + if (atomic_read(&sync_cb->cb_waiters)) { + spin_unlock(&sync_cb->cb_lock); + spin_lock(&sync_cb->cb_lock); + } +} + +#else /* !CONFIG_PREEMPT_RT: */ + +static void tasklet_lock_callback(void) { } +static void tasklet_unlock_callback(void) { } +static void tasklet_callback_sync_wait_running(void) { } + +#ifdef CONFIG_SMP +static void tasklet_callback_cancel_wait_running(void) { } +#endif +#endif /* !CONFIG_PREEMPT_RT */ + static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { @@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, tl_head->tail = &tl_head->head; local_irq_enable(); + tasklet_lock_callback(); while (list) { struct tasklet_struct *t = list; @@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, } } tasklet_unlock(t); + tasklet_callback_sync_wait_running(); continue; } tasklet_unlock(t); @@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } + tasklet_unlock_callback(); } static __latent_entropy void tasklet_action(void) @@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t) /* * Prevent a live lock when current preempted soft * interrupt processing or prevents ksoftirqd from - * running. If the tasklet runs on a different CPU - * then this has no effect other than doing the BH - * disable/enable dance for nothing. + * running. */ - local_bh_disable(); - local_bh_enable(); + tasklet_callback_cancel_wait_running(); } else { cpu_relax(); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d2d0562115b..3fe6b0c99f3d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work, - struct wake_q_head *wakeq) + struct cpu_stop_work *work) { list_add_tail(&work->list, &stopper->works); - wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. 
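For illustration only (not part of the patch): the tasklet_sync_callback scheme in the softirq.c hunk above replaces the local_bh_disable()/local_bh_enable() dance with a waiter-counted lock handoff: the callback loop holds cb_lock across callbacks and drops/retakes it between two callbacks only when a canceller has announced itself via cb_waiters. A compilable userspace analogue (link with -lpthread), with a mutex standing in for the spinlock, which on PREEMPT_RT is a sleeping lock anyway:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int cb_waiters;
static atomic_int cancelled;

/* analogue of tasklet_callback_cancel_wait_running() */
static void cancel_wait_running(void)
{
	atomic_fetch_add(&cb_waiters, 1);
	pthread_mutex_lock(&cb_lock);	/* returns only while no callback runs */
	atomic_fetch_sub(&cb_waiters, 1);
	pthread_mutex_unlock(&cb_lock);
}

/* analogue of tasklet_callback_sync_wait_running() */
static void sync_wait_running(void)
{
	if (atomic_load(&cb_waiters)) {
		pthread_mutex_unlock(&cb_lock);	/* hand the lock to the canceller */
		pthread_mutex_lock(&cb_lock);
	}
}

static void *callback_loop(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&cb_lock);
	for (int i = 0; i < 1000 && !atomic_load(&cancelled); i++) {
		/* ... run one callback under cb_lock ... */
		sync_wait_running();	/* give a pending canceller a chance */
	}
	pthread_mutex_unlock(&cb_lock);
	return NULL;
}

static void *cancel_thread(void *arg)
{
	(void)arg;
	atomic_store(&cancelled, 1);
	cancel_wait_running();		/* wait for the in-flight callback to finish */
	puts("cancelled with no callback in flight");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, callback_loop, NULL);
	pthread_create(&b, NULL, cancel_thread, NULL);
	pthread_join(b, NULL);
	pthread_join(a, NULL);
	return 0;
}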
if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; @@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work, &wakeq); + __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); - wake_up_q(&wakeq); + if (enabled) + wake_up_process(stopper->thread); preempt_enable(); return enabled; @@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); - DEFINE_WAKE_Q(wakeq); int err; retry: @@ -300,8 +297,8 @@ retry: } err = 0; - __cpu_stop_queue_work(stopper1, work1, &wakeq); - __cpu_stop_queue_work(stopper2, work2, &wakeq); + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); unlock: raw_spin_unlock(&stopper2->lock); @@ -316,7 +313,10 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + wake_up_process(stopper1->thread); + wake_up_process(stopper2->thread); + } preempt_enable(); return err; diff --git a/kernel/sys.c b/kernel/sys.c index c434968e9f5d..8b58eece4e58 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> +#include <linux/futex.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> @@ -180,6 +181,35 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); +static const struct ctl_table overflow_sysctl_table[] = { + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, +}; + +static int __init init_overflow_sysctl(void) +{ + register_sysctl_init("kernel", overflow_sysctl_table); + return 0; +} + +postcore_initcall(init_overflow_sysctl); + /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. @@ -1704,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1730,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? 
&old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); @@ -2342,54 +2390,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) -#ifdef CONFIG_ANON_VMA_NAME - -#define ANON_VMA_NAME_MAX_LEN 80 -#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" - -static inline bool is_valid_name_char(char ch) -{ - /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ - return ch > 0x1f && ch < 0x7f && - !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); -} - static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { - struct mm_struct *mm = current->mm; - const char __user *uname; - struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: - uname = (const char __user *)arg; - if (uname) { - char *name, *pch; - - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); - if (IS_ERR(name)) - return PTR_ERR(name); - - for (pch = name; *pch != '\0'; pch++) { - if (!is_valid_name_char(*pch)) { - kfree(name); - return -EINVAL; - } - } - /* anon_vma has its own copy */ - anon_name = anon_vma_name_alloc(name); - kfree(name); - if (!anon_name) - return -ENOMEM; - - } - - mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, anon_name); - mmap_write_unlock(mm); - anon_vma_name_put(anon_name); + error = set_anon_vma_name(addr, size, (const char __user *)arg); break; default: error = -EINVAL; @@ -2398,21 +2406,13 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return error; } -#else /* CONFIG_ANON_VMA_NAME */ -static int prctl_set_vma(unsigned long opt, unsigned long start, - unsigned long size, unsigned long arg) -{ - return -EINVAL; -} -#endif /* CONFIG_ANON_VMA_NAME */ - static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; - if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; - if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm)) ret |= PR_MDWE_NO_INHERIT; return ret; @@ -2445,9 +2445,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) - set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); + mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) - set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + mm_flags_set(MMF_HAS_MDWE, current->mm); return 0; } @@ -2470,6 +2470,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. 
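
For context on the prlimit64() change above: the syscall interface is unchanged; the kernel now additionally takes tasklist_lock before do_prlimit() when the target task is outside the caller's thread group. A small userspace sketch using the glibc prlimit() wrapper to query another task's RLIMIT_NOFILE (the resource choice is arbitrary):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/resource.h>
    #include <sys/types.h>

    int main(int argc, char **argv)
    {
            pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;  /* 0 == calling task */
            struct rlimit old;

            /* Passing a new limit instead of NULL exercises the same
             * do_prlimit() path, under tasklist_lock for a foreign task. */
            if (prlimit(pid, RLIMIT_NOFILE, NULL, &old)) {
                    perror("prlimit");
                    return 1;
            }
            printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
                   (unsigned long long)old.rlim_cur,
                   (unsigned long long)old.rlim_max);
            return 0;
    }
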
*/ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2488,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = -EINVAL; break; } + /* + * Ensure that either: + * + * 1. Subsequent getppid() calls reflect the parent process having died. + * 2. forget_original_parent() will send the new me->pdeath_signal. + * + * Also prevent the read of me->pdeath_signal from being a data race. + */ + read_lock(&tasklist_lock); me->pdeath_signal = arg2; + read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); @@ -2643,20 +2698,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - set_bit(MMF_DISABLE_THP, &me->mm->flags); - else - clear_bit(MMF_DISABLE_THP, &me->mm->flags); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: @@ -2788,7 +2833,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm); break; #endif case PR_RISCV_V_SET_CONTROL: @@ -2820,6 +2865,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = posixtimer_create_prctl(arg2); break; + case PR_FUTEX_HASH: + error = futex_hash_prctl(arg2, arg3, arg4); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c..bf5d05c635ff 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index eb2842bd0557..92f94ea28957 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -367,54 +367,6 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); } -/* - * Test that registering an invalid extra value is not allowed. 
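
The reworked PR_SET_THP_DISABLE/PR_GET_THP_DISABLE handling above keeps existing usage working: arg2 != 0 disables THP, and the "get" side reports the state in the prctl() return value, now with optional flag bits. A quick userspace check (sketch only; PR_THP_DISABLE_EXCEPT_ADVISED needs the updated uapi header, so only the plain disable mode is exercised here):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_THP_DISABLE
    #define PR_SET_THP_DISABLE      41
    #define PR_GET_THP_DISABLE      42
    #endif

    int main(void)
    {
            if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0))
                    perror("PR_SET_THP_DISABLE");

            /* With this patch, bit 0 means "disabled"; extra flag bits
             * (e.g. PR_THP_DISABLE_EXCEPT_ADVISED) describe the mode. */
            printf("PR_GET_THP_DISABLE -> %d\n",
                   prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
            return 0;
    }
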
- */ -static void sysctl_test_register_sysctl_sz_invalid_extra_value( - struct kunit *test) -{ - unsigned char data = 0; - const struct ctl_table table_foo[] = { - { - .procname = "foo", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_FOUR, - .extra2 = SYSCTL_ONE_THOUSAND, - }, - }; - - const struct ctl_table table_bar[] = { - { - .procname = "bar", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_NEG_ONE, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - }; - - const struct ctl_table table_qux[] = { - { - .procname = "qux", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, - }; - - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo)); - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar)); - KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux)); -} - static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), @@ -426,7 +378,6 @@ static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), - KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value), {} }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b7a7308e35b..2cd767b9680e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,76 +1,27 @@ // SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface - * - * Begun 24 March 1995, Stephen Tweedie - * Added /proc support, Dec 1995 - * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. - * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. - * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. - * Dynamic registration fixes, Stephen Tweedie. - * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. - * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris - * Horn. - * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. - * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. - * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill - * Wendling. - * The list_for_each() macro wasn't appropriate for the sysctl loop. 
- * Removed it and replaced it with older style, 03/23/00, Bill Wendling */ -#include <linux/module.h> #include <linux/sysctl.h> #include <linux/bitmap.h> -#include <linux/signal.h> -#include <linux/panic.h> -#include <linux/printk.h> #include <linux/proc_fs.h> -#include <linux/security.h> #include <linux/ctype.h> -#include <linux/filter.h> -#include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kobject.h> -#include <linux/net.h> -#include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> -#include <linux/ratelimit.h> #include <linux/initrd.h> -#include <linux/key.h> -#include <linux/times.h> #include <linux/limits.h> #include <linux/syscalls.h> -#include <linux/nfs_fs.h> -#include <linux/acpi.h> -#include <linux/reboot.h> -#include <linux/ftrace.h> -#include <linux/kmod.h> #include <linux/capability.h> -#include <linux/binfmts.h> -#include <linux/sched/sysctl.h> -#include <linux/mount.h> -#include <linux/pid.h> #include "../lib/kstrtox.h" #include <linux/uaccess.h> #include <asm/processor.h> -#ifdef CONFIG_X86 -#include <asm/nmi.h> -#include <asm/stacktrace.h> -#include <asm/io.h> -#endif -#ifdef CONFIG_SPARC -#include <asm/setup.h> -#endif -#ifdef CONFIG_RT_MUTEXES -#include <linux/rtmutex.h> -#endif - /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); @@ -103,7 +54,8 @@ static const int cap_last_cap = CAP_LAST_CAP; * to the buffer. * * These write modes control how current file position affects the behavior of - * updating sysctl values through the proc interface on each write. + * updating internal kernel (SYSCTL_USER_TO_KERN) sysctl values through the proc + * interface on each write. */ enum sysctl_writes_mode { SYSCTL_WRITES_LEGACY = -1, @@ -121,7 +73,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(char *data, int maxlen, int write, +static int _proc_do_string(char *data, int maxlen, int dir, char *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -132,7 +84,7 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { /* Only continue writes not past the end of buffer. */ len = strlen(data); @@ -220,7 +172,7 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, /** * proc_dostring - read a string sysctl * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -234,13 +186,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * * Returns 0 on success. 
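
Since the write parameter is renamed to dir throughout, a tiny standalone model may help map the old convention onto the new one. The macro definitions below are assumptions made for the sketch (the real helpers are introduced elsewhere in this series); the point is that handlers now test the transfer direction rather than a bare "write" flag:

    #include <stdio.h>
    #include <string.h>

    /* Assumed stand-ins for the kernel's direction helpers. */
    #define SYSCTL_USER_TO_KERN(dir)        ((dir) != 0)    /* a write */
    #define SYSCTL_KERN_TO_USER(dir)        ((dir) == 0)    /* a read  */

    static void demo_do_string(char *data, size_t maxlen, int dir,
                               char *buf, size_t buflen)
    {
            if (SYSCTL_USER_TO_KERN(dir)) {         /* user buffer -> kernel value */
                    strncpy(data, buf, maxlen - 1);
                    data[maxlen - 1] = '\0';
            } else {                                /* kernel value -> user buffer */
                    strncpy(buf, data, buflen - 1);
                    buf[buflen - 1] = '\0';
            }
    }

    int main(void)
    {
            char value[16] = "old", user[16] = "new";

            demo_do_string(value, sizeof(value), 1, user, sizeof(user));
            printf("after user->kern transfer: %s\n", value);   /* prints "new" */
            return 0;
    }
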
*/ -int proc_dostring(const struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - if (write) + if (SYSCTL_USER_TO_KERN(dir)) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + return _proc_do_string(table->data, table->maxlen, dir, buffer, lenp, ppos); } @@ -402,74 +354,55 @@ static void proc_put_char(void **buf, size_t *size, char c) } } -static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - WRITE_ONCE(*valp, -*lvalp); - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - WRITE_ONCE(*valp, *lvalp); - } - } else { - int val = READ_ONCE(*valp); - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} +static SYSCTL_USER_TO_KERN_INT_CONV(, SYSCTL_CONV_IDENTITY) +static SYSCTL_KERN_TO_USER_INT_CONV(, SYSCTL_CONV_IDENTITY) + +static SYSCTL_INT_CONV_CUSTOM(, sysctl_user_to_kern_int_conv, + sysctl_kern_to_user_int_conv, false) +static SYSCTL_INT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_int_conv, + sysctl_kern_to_user_int_conv, true) + + +static SYSCTL_USER_TO_KERN_UINT_CONV(, SYSCTL_CONV_IDENTITY) -static int do_proc_douintvec_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) +int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr, + const unsigned int *k_ptr) { - if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; - WRITE_ONCE(*valp, *lvalp); - } else { - unsigned int val = READ_ONCE(*valp); - *lvalp = (unsigned long)val; - } + unsigned int val = READ_ONCE(*k_ptr); + *u_ptr = (unsigned long)val; return 0; } +static SYSCTL_UINT_CONV_CUSTOM(, sysctl_user_to_kern_uint_conv, + sysctl_kern_to_user_uint_conv, false) +static SYSCTL_UINT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_uint_conv, + sysctl_kern_to_user_uint_conv, true) + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; -static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, - int write, void *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) +static int do_proc_dointvec(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { int *i, vleft, first = 1, err = 0; size_t left; char *p; - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = (int *) tbl_data; + i = (int *) table->data; vleft = table->maxlen / sizeof(*i); left = *lenp; if (!conv) - conv = do_proc_dointvec_conv; + conv = do_proc_int_conv; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; @@ -482,7 +415,7 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, unsigned long lval; bool neg; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { proc_skip_spaces(&p, &left); if (!left) @@ -492,12 +425,12 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, sizeof(proc_wspace_sep), NULL); if (err) break; - if (conv(&neg, &lval, i, 1, data)) { + if (conv(&neg, &lval, i, 1, table)) { 
err = -EINVAL; break; } } else { - if (conv(&neg, &lval, i, 0, data)) { + if (conv(&neg, &lval, i, 0, table)) { err = -EINVAL; break; } @@ -507,11 +440,11 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, } } - if (!write && !first && left && !err) + if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err) proc_put_char(&buffer, &left, '\n'); - if (write && !err && left) + if (SYSCTL_USER_TO_KERN(dir) && !err && left) proc_skip_spaces(&p, &left); - if (write && first) + if (SYSCTL_USER_TO_KERN(dir) && first) return err ? : -EINVAL; *lenp -= left; out: @@ -519,24 +452,11 @@ out: return err; } -static int do_proc_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec_w(unsigned int *tbl_data, - const struct ctl_table *table, - void *buffer, +static int do_proc_douintvec_w(const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { unsigned long lval; int err = 0; @@ -566,7 +486,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, goto out_free; } - if (conv(&lval, tbl_data, 1, data)) { + if (conv(&lval, (unsigned int *) table->data, 1, table)) { err = -EINVAL; goto out_free; } @@ -580,18 +500,16 @@ out_free: return 0; - /* This is in keeping with old __do_proc_dointvec() */ bail_early: *ppos += *lenp; return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, +static int do_proc_douintvec_r(const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { unsigned long lval; int err = 0; @@ -599,7 +517,7 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, left = *lenp; - if (conv(&lval, tbl_data, 0, data)) { + if (conv(&lval, (unsigned int *) table->data, 0, table)) { err = -EINVAL; goto out; } @@ -617,23 +535,21 @@ out: return err; } -static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, - int write, void *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +static int do_proc_douintvec(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { - unsigned int *i, vleft; + unsigned int vleft; - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = (unsigned int *) tbl_data; - vleft = table->maxlen / sizeof(*i); + vleft = table->maxlen / sizeof(unsigned int); /* * Arrays are not supported, keep this simple. 
*Do not* add @@ -645,29 +561,26 @@ static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, } if (!conv) - conv = do_proc_douintvec_conv; + conv = do_proc_uint_conv; - if (write) - return do_proc_douintvec_w(i, table, buffer, lenp, ppos, - conv, data); - return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); + if (SYSCTL_USER_TO_KERN(dir)) + return do_proc_douintvec_w(table, buffer, lenp, ppos, conv); + return do_proc_douintvec_r(table, buffer, lenp, ppos, conv); } -int do_proc_douintvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int proc_douintvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *u_ptr, unsigned int *k_ptr, + int dir, const struct ctl_table *table)) { - return __do_proc_douintvec(table->data, table, write, - buffer, lenp, ppos, conv, data); + return do_proc_douintvec(table, dir, buffer, lenp, ppos, conv); } + /** * proc_dobool - read/write a bool * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -680,7 +593,7 @@ int do_proc_douintvec(const struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dobool(const struct ctl_table *table, int write, void *buffer, +int proc_dobool(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; @@ -696,10 +609,10 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, tmp.data = &val; val = READ_ONCE(*data); - res = proc_dointvec(&tmp, write, buffer, lenp, ppos); + res = proc_dointvec(&tmp, dir, buffer, lenp, ppos); if (res) return res; - if (write) + if (SYSCTL_USER_TO_KERN(dir)) WRITE_ONCE(*data, val); return 0; } @@ -707,7 +620,7 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, /** * proc_dointvec - read a vector of integers * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -717,16 +630,16 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, * * Returns 0 on success. */ -int proc_dointvec(const struct ctl_table *table, int write, void *buffer, +int proc_dointvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); + return do_proc_dointvec(table, dir, buffer, lenp, ppos, NULL); } /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -736,100 +649,17 @@ int proc_dointvec(const struct ctl_table *table, int write, void *buffer, * * Returns 0 on success. 
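
These handlers back plain text files under /proc/sys, and that user-visible interface is untouched by the rework. A trivial userspace check against one integer knob served by proc_dointvec_minmax() (kernel.overflowuid, whose registration moves to kernel/sys.c in this diff):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/overflowuid", "r");
            int val;

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fscanf(f, "%d", &val) != 1) {
                    fclose(f);
                    fprintf(stderr, "unexpected format\n");
                    return 1;
            }
            fclose(f);
            printf("kernel.overflowuid = %d\n", val);
            return 0;
    }
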
*/ -int proc_douintvec(const struct ctl_table *table, int write, void *buffer, +int proc_douintvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); -} - -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - int i; - - /* - * If we are relying on panic_on_taint not producing - * false positives due to userspace input, bail out - * before setting the requested taint flags. - */ - if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) - return -EINVAL; - - /* - * Poor man's atomic or. Not worth adding a primitive - * to everyone's atomic.h for this - */ - for (i = 0; i < TAINT_FLAGS_COUNT; i++) - if ((1UL << i) & tmptaint) - add_taint(i, LOCKDEP_STILL_OK); - } - - return err; -} - -/** - * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_dointvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_dointvec_minmax() handler. - */ -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? &tmp : valp; - - ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - WRITE_ONCE(*valp, tmp); - } - - return 0; + return do_proc_douintvec(table, dir, buffer, lenp, ppos, + do_proc_uint_conv); } /** * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -840,62 +670,20 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success or -EINVAL on write when the range check fails. 
+ * Returns 0 on success, or -EINVAL when SYSCTL_USER_TO_KERN(dir) is true + * and the range check fails. */ -int proc_dointvec_minmax(const struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, &param); -} - -/** - * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_douintvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_douintvec_minmax() handler. - */ -struct do_proc_douintvec_minmax_conv_param { - unsigned int *min; - unsigned int *max; -}; - -static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - int ret; - unsigned int tmp; - struct do_proc_douintvec_minmax_conv_param *param = data; - /* write via temporary local uint for bounds-checking */ - unsigned int *up = write ? &tmp : valp; - - ret = do_proc_douintvec_conv(lvalp, up, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -ERANGE; - - WRITE_ONCE(*valp, tmp); - } - - return 0; + return do_proc_dointvec(table, dir, buffer, lenp, ppos, + do_proc_int_conv_minmax); } /** * proc_douintvec_minmax - read a vector of unsigned ints with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -909,23 +697,20 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success or -ERANGE on write when the range check fails. + * Returns 0 on success, or -ERANGE when SYSCTL_USER_TO_KERN(dir) is true + * and the range check fails. */ -int proc_douintvec_minmax(const struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_douintvec_minmax_conv_param param = { - .min = (unsigned int *) table->extra1, - .max = (unsigned int *) table->extra2, - }; - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, &param); + return do_proc_douintvec(table, dir, buffer, lenp, ppos, + do_proc_uint_conv_minmax); } /** * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -937,86 +722,64 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success or an error on write when the range check fails. + * Returns 0 on success, or an error when SYSCTL_USER_TO_KERN(dir) is true + * and the range check fails.
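
The removed do_proc_*_minmax_conv() helpers spelled out a useful idiom: parse into a temporary, range-check the temporary, and only then store to the live variable, so a rejected write never clobbers the current value. A standalone sketch of that idiom (names are illustrative):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int store_int_minmax(int *live, const char *str, int min, int max)
    {
            char *end;
            long tmp = strtol(str, &end, 10);       /* parse into a temporary */

            if (end == str || tmp < INT_MIN || tmp > INT_MAX)
                    return -EINVAL;
            if (tmp < min || tmp > max)
                    return -EINVAL;                 /* *live is never touched */

            *live = (int)tmp;                       /* commit only after checks */
            return 0;
    }

    int main(void)
    {
            int knob = 7;

            printf("%d knob=%d\n", store_int_minmax(&knob, "42", 0, 100), knob);
            printf("%d knob=%d\n", store_int_minmax(&knob, "999", 0, 100), knob);
            return 0;
    }
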
*/ -int proc_dou8vec_minmax(const struct ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; unsigned int min = 0, max = 255U, val; u8 *data = table->data; - struct do_proc_douintvec_minmax_conv_param param = { - .min = &min, - .max = &max, - }; int res; /* Do not support arrays yet. */ if (table->maxlen != sizeof(u8)) return -EINVAL; - if (table->extra1) - min = *(unsigned int *) table->extra1; - if (table->extra2) - max = *(unsigned int *) table->extra2; - tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; + if (!tmp.extra1) + tmp.extra1 = (unsigned int *) &min; + if (!tmp.extra2) + tmp.extra2 = (unsigned int *) &max; + val = READ_ONCE(*data); - res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, ¶m); + res = do_proc_douintvec(&tmp, dir, buffer, lenp, ppos, + do_proc_uint_conv_minmax); if (res) return res; - if (write) + if (SYSCTL_USER_TO_KERN(dir)) WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int tmp, ret; - - tmp = sysrq_mask(); - - ret = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (ret || !write) - return ret; - - if (write) - sysrq_toggle_support(tmp); - - return 0; -} -#endif - -static int __do_proc_doulongvec_minmax(void *data, - const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - unsigned long convmul, unsigned long convdiv) +static int do_proc_doulongvec_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; char *p; - if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = data; + i = table->data; min = table->extra1; max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; @@ -1028,7 +791,7 @@ static int __do_proc_doulongvec_minmax(void *data, for (; left && vleft--; i++, first = 0) { unsigned long val; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { bool neg; proc_skip_spaces(&p, &left); @@ -1057,11 +820,11 @@ static int __do_proc_doulongvec_minmax(void *data, } } - if (!write && !first && left && !err) + if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err) proc_put_char(&buffer, &left, '\n'); - if (write && !err) + if (SYSCTL_USER_TO_KERN(dir) && !err) proc_skip_spaces(&p, &left); - if (write && first) + if (SYSCTL_USER_TO_KERN(dir) && first) return err ? 
: -EINVAL; *lenp -= left; out: @@ -1069,18 +832,18 @@ out: return err; } -static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, - unsigned long convdiv) +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { - return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); + return do_proc_doulongvec_minmax(table, dir, buffer, lenp, ppos, + convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -1093,238 +856,24 @@ static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(const struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, - lenp, ppos, HZ, 1000l); -} - - -static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > INT_MAX / HZ) - return 1; - if (*negp) - WRITE_ONCE(*valp, -*lvalp * HZ); - else - WRITE_ONCE(*valp, *lvalp * HZ); - } else { - int val = READ_ONCE(*valp); - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - unsigned long jif = msecs_to_jiffies(*negp ? 
-*lvalp : *lvalp); - - if (jif > INT_MAX) - return 1; - WRITE_ONCE(*valp, (int)jif); - } else { - int val = READ_ONCE(*valp); - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? &tmp : valp; - - ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - *valp = tmp; - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_userhz_jiffies_conv, NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. 
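
For reference, the removed do_proc_dointvec_ms_jiffies_conv() boiled down to a milliseconds-to-jiffies conversion with an INT_MAX guard on the write path. A standalone model of that arithmetic; HZ is assumed to be 250 purely for the demo:

    #include <limits.h>
    #include <stdio.h>

    #define DEMO_HZ 250UL   /* assumption for the sketch */

    /* Round milliseconds up to jiffies, rejecting values that do not fit
     * in an int -- the same check the old converter made before storing. */
    static int ms_to_jiffies_checked(unsigned long ms, int *out)
    {
            unsigned long jif = (ms * DEMO_HZ + 999) / 1000;

            if (jif > INT_MAX)
                    return 1;       /* reject, leave *out untouched */
            *out = (int)jif;
            return 0;
    }

    int main(void)
    {
            int jif;

            if (!ms_to_jiffies_checked(2000, &jif))
                    printf("2000 ms -> %d jiffies at HZ=%lu\n", jif, DEMO_HZ);
            return 0;
    }
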
- */ -int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, 1l, 1l); } -static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; + return do_proc_dointvec(table, dir, buffer, lenp, ppos, conv); } /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -1338,7 +887,7 @@ static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffe * * Returns 0 on success. */ -int proc_do_large_bitmap(const struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; @@ -1348,12 +897,12 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { char *p = buffer; size_t skipped = 0; @@ -1454,7 +1003,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, } if (!err) { - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else @@ -1470,85 +1019,70 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ -int proc_dostring(const struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dobool(const struct ctl_table *table, int write, +int proc_dobool(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(const struct ctl_table *table, int write, +int proc_dointvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec(const struct ctl_table *table, int write, +int proc_douintvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(const struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec_minmax(const struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dou8vec_minmax(const struct 
ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(const struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_do_large_bitmap(const struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; @@ -1557,7 +1091,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ #if defined(CONFIG_SYSCTL) -int proc_do_static_key(const struct ctl_table *table, int write, +int proc_do_static_key(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; @@ -1571,13 +1105,13 @@ int proc_do_static_key(const struct ctl_table *table, int write, .extra2 = SYSCTL_ONE, }; - if (write && !capable(CAP_SYS_ADMIN)) + if (SYSCTL_USER_TO_KERN(dir) && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&static_key_mutex); val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { + ret = proc_dointvec_minmax(&tmp, dir, buffer, lenp, ppos); + if (SYSCTL_USER_TO_KERN(dir) && !ret) { if (val) static_key_enable(key); else @@ -1587,22 +1121,9 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static const struct ctl_table kern_table[] = { - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, +static const struct ctl_table sysctl_subsys_table[] = { #ifdef CONFIG_PROC_SYSCTL { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, - { .procname = "sysctl_writes_strict", .data = &sysctl_writes_strict, .maxlen = sizeof(int), @@ -1613,180 +1134,6 @@ static const struct ctl_table kern_table[] = { }, #endif { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = 
sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = MAX_TRACER_SIZE, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_max_threads, - }, - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - .data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { .procname = "ngroups_max", .data = (void *)&ngroups_max, .maxlen = sizeof (int), @@ -1800,20 +1147,10 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ - defined(CONFIG_DEBUG_STACKOVERFLOW) - { 
- .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { - .procname = "randomize_va_space", - .data = &randomize_va_space, + .procname = "unaligned-trap", + .data = &unaligned_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1828,49 +1165,11 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_TREE_RCU - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "max_rcu_stall_to_panic", - .data = &sysctl_max_rcu_stall_to_panic, - .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_INT_MAX, - }, -#endif }; int __init sysctl_init_bases(void) { - register_sysctl_init("kernel", kern_table); + register_sysctl_init("kernel", sysctl_subsys_table); return 0; } @@ -1882,12 +1181,8 @@ int __init sysctl_init_bases(void) EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); -EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); -EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); -EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/task_work.c b/kernel/task_work.c index d1efec571a4a..0f7519f8e7c9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + /* + * no-op IPI + * + * TWA_NMI_CURRENT will already have set the TIF flag, all + * this interrupt does it tickle the return-to-user path. + */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); @@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. 
@@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. endmenu -endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e6e9b85d4db5..f7d52d9543cc 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0ddccdff119a..069d93bfb0c7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base @@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } @@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev) if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard (spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) @@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long 
flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ @@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev) struct timerqueue_node *next; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + next = timerqueue_getnext(&base->timerqueue); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); @@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** diff --git 
a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906..a59bc75ab7c5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index bb48498ebb5a..a1890a073196 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -144,7 +144,7 @@ static u64 suspend_start; * Default for maximum permissible skew when cs->uncertainty_margin is * not specified, and the lower bound even when cs->uncertainty_margin * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * clocks with unspecified cs->uncertainty_margin, so this macro is used * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. */ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); @@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_first(cpu_online_mask); - if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpumask_set_cpu(cpu, &cpus_chosen); @@ -342,10 +340,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); + cpu = cpumask_random(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } @@ -412,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); @@ -589,9 +583,7 @@ static void clocksource_watchdog(struct timer_list *unused) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. 
*/ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); /* * Arm timer if not already pending: could race with concurrent diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram + * elected new base. An IPI will be issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) @@ -787,10 +780,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. 
*/ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { switch (clock_id) { - case CLOCK_REALTIME: - return HRTIMER_BASE_REALTIME; case CLOCK_MONOTONIC: return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; case CLOCK_BOOTTIME: return HRTIMER_BASE_BOOTTIME; case CLOCK_TAI: @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) @@ -2129,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2156,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); @@ -2295,11 +2311,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. 
- */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e21..7c6110e964e7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..d31a6d40d38d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +98,129 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } + +#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ) +#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ) + +static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ) +static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ) +static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t) +static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs) + +static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz, + sysctl_kern_to_user_int_conv_hz, false) +static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies, + sysctl_user_to_kern_int_conv_userhz, + sysctl_kern_to_user_int_conv_userhz, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax, + sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, true) + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. 
+ */ +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_jiffies); + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) + return -EINVAL; + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_userhz_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies_minmax); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, + HZ, 1000l); +} +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e3642278df43..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -242,22 +241,24 @@ static void timens_set_vvar_page(struct task_struct *task, for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } + out: mutex_unlock(&offset_lock); } void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -461,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -471,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -479,9 +478,12 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, - .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b837d3d9d325..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/audit.h> +#include <linux/timekeeper_internal.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - .time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data.tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. 
*/ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { - struct ntp_data *ntpdata = &tk_ntp_data; - ktime_t ret; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -702,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. */ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; @@ -759,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1031,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1083,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,14 +3,13 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ @@ -1548,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1590,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 6222112533a7..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,8 +30,6 @@ #include "timekeeping.h" #include "posix-timers.h" -static struct kmem_cache *posix_timers_cache; - /* * Timers are managed in a hash table for lockless lookup. 
The hash key is * constructed from current::signal and the timer ID and the timer is @@ -49,10 +47,12 @@ struct timer_hash_bucket { static struct { struct timer_hash_bucket *buckets; unsigned long mask; -} __timer_data __ro_after_init __aligned(2*sizeof(long)); + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); -#define timer_buckets (__timer_data.buckets) -#define timer_hashmask (__timer_data.mask) +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); @@ -283,14 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), - __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX @@ -307,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -484,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -499,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. @@ -543,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space + * After successful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create @@ -833,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) @@ -1251,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. 
@tp contains the resolution @@ -1534,6 +1525,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) @@ -1556,6 +1550,11 @@ static int __init posixtimer_init(void) unsigned long i, size; unsigned int shift; + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + if (IS_ENABLED(CONFIG_BASE_SMALL)) size = 512; else diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc15fe293719..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { @@ -296,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -305,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index c0e960a5de39..3c90574bd904 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -22,7 +22,7 @@ struct process_timer { static void process_timeout(struct timer_list *t) { - struct process_timer *timeout = from_timer(timeout, t, timer); + struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } @@ -100,7 +100,7 @@ signed long __sched schedule_timeout(signed long timeout) timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); + timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/time/tick-common.c 
b/kernel/time/tick-common.c index 9a3859443c04..7e33d3f2e889 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9..4e4f7bbe2a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprogramming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for oneshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode.
*/ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time to update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } diff --git a/kernel/time/time.c b/kernel/time/time.c index 1b69caa87480..0ba8e3c50d62 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -858,6 +858,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } +EXPORT_SYMBOL_GPL(timespec64_add_safe); /** * get_timespec64 - get user's time value into kernel space diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index e6285288d765..3d2a354cfe1c 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -6,7 +6,7 @@ #include <linux/timecounter.h> void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, + struct cyclecounter *cc, u64 start_tstamp) { tc->cc = cc; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include
<linux/interrupt.h> +#include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> @@ -25,6 +26,8 @@ #include <linux/audit.h> #include <linux/random.h> +#include <vdso/auxclock.h> + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -53,7 +56,38 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif + +static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +147,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -164,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. 
+ */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -576,7 +645,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -638,7 +707,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); @@ -653,18 +722,22 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; @@ -708,6 +781,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /** @@ -804,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -813,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -949,9 +1023,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. 
This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1230,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; @@ -1386,41 +1465,73 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - - timekeeping_forward_now(tks); + timekeeping_forward_now(tks); + if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tk_update_aux_offs(tks, offs); } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. 
@@ -1496,6 +1607,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -1547,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1623,10 +1769,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1656,7 +1804,8 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1845,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -1912,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); @@ -2008,7 +2171,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2089,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if 
(unlikely(leap)) { struct timespec64 ts; @@ -2155,15 +2318,13 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; - - guard(raw_spinlock_irqsave)(&tk_core.lock); + u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -2172,7 +2333,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2188,7 +2349,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2205,19 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + /* + * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2248,7 +2425,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2271,7 +2448,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2350,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); @@
-2415,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2474,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2492,74 +2683,103 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + + add_device_randomness(&ts, sizeof(ts)); + + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - s32 orig_tai, tai; + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; +} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - 
timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } - } +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; - audit_ntp_log(&ad); + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); + + audit_ntp_log(&result.ad); - if (clock_set) + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. + */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function @@ -2573,3 +2793,321 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. + */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) 
+ * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. 
+ */ + return __do_adjtimex(aux_tkd, txc, &result); +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, +}; + +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; + + if (!tko) + return ret; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) + goto err_clean; + + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (ret) + goto err_clean; + } + return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; +} +late_initcall(tk_aux_sysfs_init); + +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* 
CONFIG_POSIX_AUX_CLOCKS */ diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 8c9079108ffb..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4d915c0a263c..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. 
- */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags @@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior to calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -1500,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1511,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. 
On success the timer is not @@ -1526,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1900,7 +1859,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1963,7 +1922,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON @@ -2015,7 +1974,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } @@ -2399,7 +2358,7 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. 
*/ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); @@ -2514,7 +2473,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); @@ -2544,7 +2503,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2599,7 +2558,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2612,7 +2571,7 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada6469..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 2f6330831f08..18dda1aa782d 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group *tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. 
+ * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). + */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); } /* @@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +553,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -708,7 +747,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race @@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. 
*/ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not @@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1405,23 +1434,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. 
+ */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* @@ -1435,38 +1461,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1474,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } + return 0; +} + +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. 
+ */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + return 0; } +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1501,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); - /* - * If this is a new top-level, prepare its groupmask in advance. - * This avoids accidents where yet another new top-level is - * created in the future and made visible before the current groupmask. - */ - if (list_empty(&tmigr_level_list[lvl])) { - group->groupmask = BIT(0); - /* - * The previous top level has prepared its groupmask already, - * simply account it as the first child. 
- */ - if (lvl > 0) - group->num_children = 1; - } - timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1523,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1570,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { - struct tmigr_walk data; - - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be intialized accordingly. + */ + parent->num_children = 1; + } - if (activate) { + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { /* - * @child is the old top and @parent the new one. In this - * case groupmask is pre-initialized and @child already - * accounted, along with its new sibling corresponding to the - * CPU going up. + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. */ - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); } else { - /* Adding @child for the CPU going up to @parent. */ + /* Common case adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } @@ -1599,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); - trace_tmigr_connect_child_parent(child); - - if (!activate) - return; - - /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. 
So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). - * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. - */ - data.childmask = child->groupmask; - - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. 
+ * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) + if (group->parent) break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } - } while (i < tmigr_hierarchy_levels); - - /* Assert single root */ - WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1695,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1708,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; + if (activate) { + struct tmigr_walk data; + union tmigr_state state; /* - * Newly created root level should have accounted the upcoming - * CPU's child group and pre-accounted the old root. + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - if (group->num_children == 2 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. 
- */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } - tmigr_connect_child_parent(child, group, true); - } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); } + tmigr_root = group; } - +out: kfree(stack); return err; @@ -1751,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1801,6 +1956,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. 
Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1850,7 +2010,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index ae19f70f8170..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 01c2ab1e8971..aa59919b8f2c 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,26 +15,25 @@ #include "timekeeping_internal.h" +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) +{ + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vc[CS_RAW].mask = tk->tkr_raw.mask; - vc[CS_RAW].mult = tk->tkr_raw.mult; - vc[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; @@ -98,12 +97,12 @@ void update_vsyscall(struct timekeeper *tk) /* CLOCK_REALTIME_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); @@ -119,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); @@ -136,6 +136,46 @@ void update_vsyscall_tz(void) __arch_sync_vdso_time_data(vdata); } +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + 
clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); + + vc->clock_mode = clock_mode; + + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->monotonic_to_aux.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + __arch_sync_vdso_time_data(vdata); +} +#endif + /** * vdso_update_begin - Start of a VDSO update section * diff --git a/kernel/torture.c b/kernel/torture.c index 3a0a8cc60401..1ea9f67953a7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -359,6 +359,8 @@ torture_onoff(void *arg) torture_hrtimeout_jiffies(onoff_holdoff, &rand); VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); while (!torture_must_stop()) { if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { torture_hrtimeout_jiffies(HZ / 10, &rand); @@ -797,8 +799,9 @@ static unsigned long torture_init_jiffies; static void torture_print_module_parms(void) { - pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n", - torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle); + pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d%s\n", + torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle, + rcu_inkernel_boot_has_ended() ? "" : " still booting"); } /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..bfa2ec46e075 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool +config HAVE_EXTRA_IPI_TRACEPOINTS + bool + help + For architectures that use ipi_raise, ipi_entry and ipi_exit + tracepoints. + config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help @@ -74,10 +80,11 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. -config HAVE_FTRACE_MCOUNT_RECORD +config HAVE_DYNAMIC_FTRACE_WITH_JMP bool help - See Documentation/trace/ftrace-design.rst + If the architecture supports to replace the __fentry__ with a + "jmp" instruction. config HAVE_SYSCALL_TRACEPOINTS bool @@ -199,6 +206,19 @@ menuconfig FTRACE if FTRACE +config TRACEFS_AUTOMOUNT_DEPRECATED + bool "Automount tracefs on debugfs [DEPRECATED]" + depends on TRACING + default y + help + The tracing interface was moved from /sys/kernel/debug/tracing + to /sys/kernel/tracing in 2015, but the tracing file system + was still automounted in /sys/kernel/debug for backward + compatibility with tooling. + + The new interface has been around for more than 10 years and + the old debug mount will soon be removed. 
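The TRACEFS_AUTOMOUNT_DEPRECATED help above names both mount points, so tooling should prefer /sys/kernel/tracing and treat the debugfs automount purely as a legacy fallback. A minimal userspace sketch of that policy, assuming nothing beyond the two paths quoted in the help text (the helper name is illustrative, not part of the patch):

	/* Hypothetical helper: try the current tracefs mount first and fall
	 * back to the deprecated debugfs automount only if it is missing. */
	#include <stdio.h>
	#include <fcntl.h>

	static int open_trace_file(const char *name)
	{
		char path[256];
		int fd;

		snprintf(path, sizeof(path), "/sys/kernel/tracing/%s", name);
		fd = open(path, O_RDONLY);
		if (fd >= 0)
			return fd;

		/* Legacy location, only present while the automount remains. */
		snprintf(path, sizeof(path), "/sys/kernel/debug/tracing/%s", name);
		return open(path, O_RDONLY);
	}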
+ config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING @@ -275,7 +295,7 @@ config FUNCTION_TRACE_ARGS funcgraph-args (for the function graph tracer) config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" + bool depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y @@ -316,6 +336,26 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config DYNAMIC_FTRACE_WITH_JMP + def_bool y + depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS + depends on HAVE_DYNAMIC_FTRACE_WITH_JMP + +config FUNCTION_SELF_TRACING + bool "Function trace tracing code" + depends on FUNCTION_TRACER + help + Normally all the tracing code is set to notrace, where the function + tracer will ignore all the tracing functions. Sometimes it is useful + for debugging to trace some of the tracing infratructure itself. + Enable this to allow some of the tracing infrastructure to be traced + by the function tracer. Note, this will likely add noise to function + tracing if events and other tracing features are enabled along with + function tracing. + + If unsure, say N. + config FPROBE bool "Kernel Function Probe (fprobe)" depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC @@ -561,6 +601,20 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACE_SYSCALL_BUF_SIZE_DEFAULT + int "System call user read max size" + range 0 165 + default 63 + depends on FTRACE_SYSCALLS + help + Some system call trace events will record the data from a user + space address that one of the parameters point to. The amount of + data per event is limited. That limit is set by this config and + this config also affects how much user space data perf can read. + + For a tracing instance, this size may be changed by writing into + its syscall_user_buf_size file. + config TRACER_SNAPSHOT bool "Create a snapshot trace buffer" select TRACER_MAX_TRACE @@ -779,6 +833,20 @@ config UPROBE_EVENTS This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config EPROBE_EVENTS + bool "Enable event-based dynamic events" + depends on TRACING + depends on HAVE_REGS_AND_STACK_ACCESS_API + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + Eprobes are dynamic events that can be placed on other existing + events. It can be used to limit what fields are recorded in + an event or even dereference a field of an event. It can + convert the type of an event field. For example, turn an + address into a string. + config BPF_EVENTS depends on BPF_SYSCALL depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS @@ -803,27 +871,22 @@ config BPF_KPROBE_OVERRIDE Allows BPF to override the execution of a probed function and set a different return value. This is used for error injection. 
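For BPF_KPROBE_OVERRIDE, the override described in the help text is exercised through the bpf_override_return() helper from a kprobe program attached to a function that permits error injection. A minimal sketch, assuming a libbpf-style build with vmlinux.h; should_failslab is used purely as an illustrative injection point:

	/* Illustrative BPF kprobe program: make the probed function return
	 * -ENOMEM instead of executing its body (error injection). */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	SEC("kprobe/should_failslab")
	int override_should_failslab(struct pt_regs *ctx)
	{
		/* Needs CONFIG_BPF_KPROBE_OVERRIDE and a target annotated with
		 * ALLOW_ERROR_INJECTION(). -12 is ENOMEM, spelled as a literal
		 * because vmlinux.h carries no errno macros. */
		bpf_override_return(ctx, -12);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";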
-config FTRACE_MCOUNT_RECORD - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_FTRACE_MCOUNT_RECORD - config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY bool - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_CC def_bool y depends on $(cc-option,-mrecord-mcount) depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_OBJTOOL def_bool y depends on HAVE_OBJTOOL_MCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -831,7 +894,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_OBJTOOL - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config TRACING_MAP bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 057cd975d014..fc5dcc888e13 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o endif endif +# Allow some files to be function traced +ifdef CONFIG_FUNCTION_SELF_TRACING +CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE) +CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE) +CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE) +endif + ifdef CONFIG_FTRACE_STARTUP_TEST CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o @@ -82,7 +99,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3679a6d18934..d031c8d80be4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,13 +63,116 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. 
+ */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = lower_32_bits(what); + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, + pdu_len); +} + /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; @@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + action = lower_32_bits(action | (cgid ? 
__BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? __BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, + cgid_len, (void *)data, len); } /* @@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; + case REQ_OP_WRITE_ZEROES: + what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES); + break; + default: + break; + } + + /* Drop trace events for zone operations with blktrace v1 */ + if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) { + pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n", + (unsigned long long)what); + return; + } + if (cgid) what |= __BLK_TA_CGROUP; @@ -255,13 +387,68 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + switch (bt->version) { + case 1: + trace_len = sizeof(struct blk_io_trace); + break; + case 2: + default: + /* + * ftrace always uses v2 (blk_io_trace2) format. + * + * For sysfs-enabled tracing path (enabled via + * /sys/block/DEV/trace/enable), blk_trace_setup_queue() + * never initializes bt->version, leaving it 0 from + * kzalloc(). We must handle version==0 safely here. + * + * Fall through to default to ensure we never hit the + * old bug where default set trace_len=0, causing + * buffer underflow and memory corruption. + * + * Always use v2 format for ftrace and normalize + * bt->version to 2 when uninitialized. + */ + trace_len = sizeof(struct blk_io_trace2); + if (bt->version == 0) + bt->version = 2; + break; + } + trace_len += pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - goto record_it; + + switch (bt->version) { + case 1: + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + case 2: + default: + /* + * Use v2 recording function (record_blktrace_event2) + * which writes blk_io_trace2 structure with correct + * field layout: + * - 32-bit pid at offset 28 + * - 64-bit action at offset 32 + * + * Fall through to default handles version==0 case + * (from sysfs path), ensuring we always use correct + * v2 recording function to match the v2 buffer + * allocated above. 
+ */ + record_blktrace_event2(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + } + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -273,41 +460,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } @@ -415,9 +571,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +613,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. 
- */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +631,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -511,9 +650,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -521,31 +661,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -565,7 +693,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -573,31 +701,52 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. 
*/ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -609,30 +758,43 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; + struct blk_trace *bt; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -642,19 +804,54 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + + if (copy_from_user(&buts2, arg, sizeof(buts2))) + return -EFAULT; + + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; + + if (buts2.flags != 0) + return -EINVAL; + + mutex_lock(&q->debugfs_mutex); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + mutex_unlock(&q->debugfs_mutex); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int 
compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - buts = (struct blk_user_trace_setup) { + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -664,12 +861,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } @@ -725,6 +926,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) char b[BDEVNAME_SIZE]; switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); @@ -812,7 +1017,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -864,6 +1069,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -876,7 +1097,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -893,11 +1114,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) -{ - blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); -} - static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { @@ -947,7 +1163,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; @@ -959,6 +1175,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = 
rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1089,8 +1336,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); @@ -1101,6 +1346,15 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1120,12 +1374,15 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); @@ -1139,7 +1396,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1154,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE_ZEROES) { + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + } else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; @@ -1174,9 +1434,9 @@ out: } static inline -const struct blk_io_trace *te_blk_io_trace(const struct 
trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1235,7 +1495,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1249,7 +1509,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1462,7 +1722,6 @@ static const struct { [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; @@ -1471,7 +1730,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1479,7 +1738,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; - long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE)); log_action = classic ? 
&blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; @@ -1508,8 +1767,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, @@ -1544,9 +1803,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO); else - tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER(CONTEXT_INFO); } return 0; } @@ -1586,6 +1845,10 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); + return 0; } @@ -1694,6 +1957,7 @@ static const struct { { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, + { BLK_TC_WRITE_ZEROES, "write-zeroes" }, }; static int blk_trace_str2mask(const char *str) @@ -1884,6 +2148,33 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) case REQ_OP_READ: rwbs[i++] = 'R'; break; + case REQ_OP_ZONE_APPEND: + rwbs[i++] = 'Z'; + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + rwbs[i++] = 'Z'; + rwbs[i++] = 'R'; + if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_FINISH: + rwbs[i++] = 'Z'; + rwbs[i++] = 'F'; + break; + case REQ_OP_ZONE_OPEN: + rwbs[i++] = 'Z'; + rwbs[i++] = 'O'; + break; + case REQ_OP_ZONE_CLOSE: + rwbs[i++] = 'Z'; + rwbs[i++] = 'C'; + break; + case REQ_OP_WRITE_ZEROES: + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + break; default: rwbs[i++] = 'N'; } @@ -1896,6 +2187,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; + if (opf & REQ_ATOMIC) + rwbs[i++] = 'U'; + + WARN_ON_ONCE(i >= RWBS_LEN); rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187dc37d61d4..fe28d86f7c35 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/key.h> -#include <linux/verification.h> #include <linux/namei.h> #include <net/bpf_sk_storage.h> @@ -572,7 +571,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ -781,8 +780,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) return (unsigned long) task_pt_regs(task); } -BTF_ID_LIST(bpf_task_pt_regs_ids) -BTF_ID(struct, pt_regs) +BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, @@ -882,7 +880,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { 
+const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -894,14 +892,14 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; @@ -967,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; @@ -1185,7 +1183,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1242,244 +1240,14 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. - */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). 
- * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). 
- */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1488,65 +1256,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? 
NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case BPF_FUNC_find_vma: - return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: break; } @@ -1627,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1638,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -1858,7 +1570,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -2571,7 +2283,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2691,7 +2402,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2725,10 +2436,37 @@ static int 
bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + + seq_printf(seq, + "kprobe_cnt:\t%u\n" + "missed:\t%lu\n", + kmulti_link->cnt, + kmulti_link->fp.nmissed); + + seq_printf(seq, "%s\t %s\n", "cookie", "func"); + for (int i = 0; i < kmulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %pS\n", + kmulti_link->cookies[i], + (void *)kmulti_link->addrs[i]); + } +} +#endif + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_kprobe_multi_show_fdinfo, +#endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) @@ -2791,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return run_ctx->entry_ip; } -static int +static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) @@ -2808,20 +2546,25 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, struct pt_regs *regs; int err; + /* + * graph tracer framework ensures we won't migrate, so there is no need + * to use migrate_disable for bpf_prog_run again. The check here just for + * __this_cpu_inc_return. + */ + cant_sleep(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } - migrate_disable(); rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); - migrate_enable(); out: __this_cpu_dec(bpf_prog_active); @@ -2987,9 +2730,16 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; + /* Writing to context is not allowed for kprobes. 
*/ + if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; @@ -3062,7 +2812,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - &bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -3078,7 +2828,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3147,7 +2897,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3211,7 +2960,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3256,10 +3005,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_uprobe_multi_link *umulti_link; + char *p, *buf; + pid_t pid; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return; + + p = d_path(&umulti_link->path, buf, PATH_MAX); + if (IS_ERR(p)) { + kfree(buf); + return; + } + + pid = umulti_link->task ? 
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + seq_printf(seq, + "uprobe_cnt:\t%u\n" + "pid:\t%u\n" + "path:\t%s\n", + umulti_link->cnt, pid, p); + + seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); + for (int i = 0; i < umulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %#llx\t %#lx\n", + umulti_link->uprobes[i].cookie, + umulti_link->uprobes[i].offset, + umulti_link->uprobes[i].ref_ctr_offset); + } + + kfree(buf); +} +#endif + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_uprobe_multi_show_fdinfo, +#endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3376,6 +3169,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; @@ -3417,7 +3213,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; @@ -3466,10 +3264,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), @@ -3565,6 +3363,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) late_initcall(bpf_kprobe_multi_kfuncs_init); +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u64 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u64, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. 
+ */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u64 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u64, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, @@ -3576,4 +3514,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + 
copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8d925cbdce3a..cc48d16be43e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -163,7 +163,7 @@ enum { #define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset])) /* - * Each fgraph_ops has a reservered unsigned long at the end (top) of the + * Each fgraph_ops has a reserved unsigned long at the end (top) of the * ret_stack to store task specific state. */ #define SHADOW_STACK_TASK_VARS(ret_stack) \ @@ -498,9 +498,6 @@ found: return get_data_type_data(current, offset); } -/* Both enabled by default (can be cleared by function_graph tracer flags */ -bool fgraph_sleep_time = true; - #ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something @@ -815,6 +812,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -829,6 +827,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nest + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. 
+ */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL trace.retval = ftrace_regs_get_return_value(fregs); #endif @@ -852,6 +859,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of @@ -1007,15 +1016,11 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops, mutex_init(&dst_ops->local_hash.regex_lock); INIT_LIST_HEAD(&dst_ops->subop_list); dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED; + dst_ops->private = src_ops->private; } #endif } -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h @@ -1086,7 +1091,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. */ - if (fgraph_sleep_time) + if (!fgraph_no_sleep_time) return; timestamp = trace_clock_local(); @@ -1325,6 +1330,10 @@ int register_ftrace_graph(struct fgraph_ops *gops) int ret = 0; int i = -1; + if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH, + "function graph ops registered again")) + return -EBUSY; + guard(mutex)(&ftrace_lock); if (!fgraph_stack_cachep) { @@ -1360,6 +1369,13 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active++; + /* Always save the function, and reset at unregistering */ + gops->saved_func = gops->entryfunc; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_pids_enabled(&gops->ops)) + gops->entryfunc = fgraph_pid_func; +#endif + if (ftrace_graph_active == 2) ftrace_graph_disable_direct(true); @@ -1379,8 +1395,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) } else { init_task_vars(gops->idx); } - /* Always save the function, and reset at unregistering */ - gops->saved_func = gops->entryfunc; + + gops->ops.flags |= FTRACE_OPS_FL_GRAPH; ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); if (!ret) @@ -1391,6 +1407,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } @@ -1399,17 +1417,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; + if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH), + "function graph ops unregistered without registering")) + return; + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - return; + goto out; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - return; + goto out; if (fgraph_lru_release_index(gops->idx) < 0) - return; + goto out; fgraph_array[gops->idx] = &fgraph_stub; @@ -1432,4 +1454,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } gops->saved_func = NULL; + out: + gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76..1188eefef07c 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -10,6 +10,7 @@ #include <linux/kprobes.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/rhashtable.h> #include <linux/slab.h> #include <linux/sort.h> @@ -29,7 +30,7 @@ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still * exists. 
The key is the address of fprobe instance. * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe - * instance related to the funciton address. The key is the ftrace IP + * instance related to the function address. The key is the ftrace IP * address. * * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp @@ -41,60 +42,68 @@ * - RCU hlist traversal under disabling preempt */ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); +static struct fgraph_ops fprobe_graph_ops; -/* - * Find first fprobe in the hlist. It will be iterated twice in the entry - * probe, once for correcting the total required size, the second time is - * calling back the user handlers. - * Thus the hlist in the fprobe_table must be sorted and new probe needs to - * be added *before* the first fprobe. - */ -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { - struct fprobe_hlist_node *node; - struct hlist_head *head; + return hash_ptr(*(unsigned long **)data, 32); +} - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (node->addr == ip) - return node; - } - return NULL; +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; } -NOKPROBE_SYMBOL(find_first_fprobe_node); -/* Node insertion and deletion requires the fprobe_mutex */ -static void insert_fprobe_node(struct fprobe_hlist_node *node) +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) { - unsigned long ip = node->addr; - struct fprobe_hlist_node *next; - struct hlist_head *head; + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} + +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, +}; +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node) +{ lockdep_assert_held(&fprobe_mutex); - next = find_first_fprobe_node(ip); - if (next) { - hlist_add_before_rcu(&node->hlist, &next->hlist); - return; - } - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_add_head_rcu(&node->hlist, head); + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } /* Return true if there are synonims */ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); + bool ret; /* Avoid double deleting */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } - return !!find_first_fprobe_node(node->addr); + + rcu_read_lock(); + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, + fprobe_rht_params); + rcu_read_unlock(); + + return ret; } /* Check existence of the fprobe */ @@ -246,12 +255,128 @@ static inline int 
__fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } -static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, - struct ftrace_regs *fregs) +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */ +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; + struct fprobe *fp; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + /* + * ftrace_test_recursion_trylock() disables preemption, but + * rhltable_lookup() checks whether rcu_read_lcok is held. + * So we take rcu_read_lock() here. + */ + rcu_read_lock(); + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params); + + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler)) + continue; + + if (fprobe_shared_with_kprobes(fp)) + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL); + else + __fprobe_handler(ip, parent_ip, fp, fregs, NULL); + } + rcu_read_unlock(); + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_ftrace_entry); + +static struct ftrace_ops fprobe_ftrace_ops = { + .func = fprobe_ftrace_entry, + .flags = FTRACE_OPS_FL_SAVE_ARGS, +}; +static int fprobe_ftrace_active; + +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_ftrace_active) { + ret = register_ftrace_function(&fprobe_ftrace_ops); + if (ret) { + ftrace_free_filter(&fprobe_ftrace_ops); + return ret; + } + } + fprobe_ftrace_active++; + return 0; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_ftrace_active--; + if (!fprobe_ftrace_active) + unregister_ftrace_function(&fprobe_ftrace_ops); + if (num) + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return !fp->exit_handler; +} + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset); +} +#endif +#else +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + return -ENOENT; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return false; +} + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); +} +#endif +#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +/* fgraph_ops callback, this processes fprobes which have exit_handler. 
*/ +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct fprobe_hlist_node *node, *first; unsigned long *fgraph_data = NULL; unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; unsigned long ret_ip; int reserved_words; struct fprobe *fp; @@ -260,14 +385,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; - first = node = find_first_fprobe_node(func); - if (unlikely(!first)) - return 0; - + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || !fp->exit_handler) continue; @@ -278,15 +401,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, reserved_words += FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); } - node = first; if (reserved_words) { fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); if (unlikely(!fgraph_data)) { - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); - if (fp && !fprobe_disabled(fp)) + if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp)) fp->nmissed++; } return 0; @@ -299,14 +421,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, */ ret_ip = ftrace_regs_get_return_address(fregs); used = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { int data_size; void *data; if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); - if (!fp || fprobe_disabled(fp)) + if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp))) continue; data_size = fp->entry_data_size; @@ -334,7 +456,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, /* If any exit_handler is set, data must be used. */ return used != 0; } -NOKPROBE_SYMBOL(fprobe_entry); +NOKPROBE_SYMBOL(fprobe_fgraph_entry); static void fprobe_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, @@ -352,7 +474,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, size_words = SIZE_IN_LONG(size); ret_ip = ftrace_regs_get_instruction_pointer(fregs); - preempt_disable(); + preempt_disable_notrace(); curr = 0; while (size_words > curr) { @@ -368,12 +490,12 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } curr += size; } - preempt_enable(); + preempt_enable_notrace(); } NOKPROBE_SYMBOL(fprobe_return); static struct fgraph_ops fprobe_graph_ops = { - .entryfunc = fprobe_entry, + .entryfunc = fprobe_fgraph_entry, .retfunc = fprobe_return, }; static int fprobe_graph_active; @@ -428,8 +550,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad { unsigned long *addrs; - if (alist->index >= alist->size) - return -ENOMEM; + /* Previously we failed to expand the list. 
*/ + if (alist->index == alist->size) + return -ENOSPC; alist->addrs[alist->index++] = addr; if (alist->index < alist->size) @@ -448,24 +571,18 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad return 0; } -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, - struct fprobe_addr_list *alist) +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - struct fprobe_hlist_node *node; - int ret = 0; - - hlist_for_each_entry_rcu(node, head, hlist) { - if (!within_module(node->addr, mod)) - continue; - if (delete_fprobe_node(node)) - continue; - /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. - */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); - } + if (!within_module(node->addr, mod)) + return; + if (delete_fprobe_node(node)) + return; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. */ @@ -473,8 +590,9 @@ static int fprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; struct module *mod = data; - int i; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; @@ -485,12 +603,19 @@ static int fprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; mutex_lock(&fprobe_mutex); - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + fprobe_remove_node_in_module(mod, node, &alist); - if (alist.index < alist.size && alist.index > 0) - ftrace_set_filter_ips(&fprobe_graph_ops.ops, - alist.addrs, alist.index, 1, 0); + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + if (alist.index > 0) + fprobe_set_ips(alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); kfree(alist.addrs); @@ -647,6 +772,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) #define FPROBE_IPS_MAX INT_MAX +int fprobe_count_ips_from_filter(const char *filter, const char *notfilter) +{ + return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); +} + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. 
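The fprobe hunks above replace the fixed-size fprobe_ip_table array of hlist buckets with an rhltable, a resizable hash table whose buckets may chain several entries sharing one key; that property matters here because more than one fprobe can be attached to the same function address. Below is a minimal sketch of the generic rhltable pattern these hunks rely on, using hypothetical names (demo_node, demo_table, demo_params and the demo_* helpers) rather than the fprobe structures; the real code additionally supplies custom hashfn/obj_hashfn/obj_cmpfn callbacks so the address is hashed with hash_ptr(), which this sketch leaves to the default key hashing for brevity.

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>

/* Hypothetical example type: several nodes may share one key. */
struct demo_node {
	unsigned long key;		/* e.g. a probed function address */
	struct rhlist_head list;	/* linkage inside the rhltable */
};

static struct rhltable demo_table;

static const struct rhashtable_params demo_params = {
	.head_offset = offsetof(struct demo_node, list),
	.key_offset  = offsetof(struct demo_node, key),
	.key_len     = sizeof_field(struct demo_node, key),
	.automatic_shrinking = true,
};

static int demo_setup(void)
{
	/* One-time table setup; fprobe does this from an initcall. */
	return rhltable_init(&demo_table, &demo_params);
}

static int demo_add(struct demo_node *node)
{
	/* Duplicate keys are allowed: nodes with the same key are chained. */
	return rhltable_insert(&demo_table, &node->list, demo_params);
}

static void demo_del(struct demo_node *node)
{
	rhltable_remove(&demo_table, &node->list, demo_params);
}

static void demo_for_each(unsigned long key)
{
	struct rhlist_head *head, *pos;
	struct demo_node *node;

	/* Lookup and traversal must run under RCU. */
	rcu_read_lock();
	head = rhltable_lookup(&demo_table, &key, demo_params);
	rhl_for_each_entry_rcu(node, pos, head, list) {
		/* act on every node that was inserted with @key */
	}
	rcu_read_unlock();
}

As in the patch, insertion and removal would be serialized by a mutex (fprobe_mutex there), while readers only take rcu_read_lock(); rhltable_lookup() returns NULL when nothing hashes to the key, and the rhl_for_each_entry_rcu() walk simply does nothing in that case.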
@@ -718,11 +848,23 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) mutex_lock(&fprobe_mutex); hlist_array = fp->hlist_array; - ret = fprobe_graph_add_ips(addrs, num); + if (fprobe_is_ftrace(fp)) + ret = fprobe_ftrace_add_ips(addrs, num); + else + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) - insert_fprobe_node(&hlist_array->array[i]); + for (i = 0; i < hlist_array->size; i++) { + ret = insert_fprobe_node(&hlist_array->array[i]); + if (ret) + break; + } + /* fallback on insert error */ + if (ret) { + for (i--; i >= 0; i--) + delete_fprobe_node(&hlist_array->array[i]); + } } mutex_unlock(&fprobe_mutex); @@ -806,7 +948,10 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - fprobe_graph_remove_ips(addrs, count); + if (fprobe_is_ftrace(fp)) + fprobe_ftrace_remove_ips(addrs, count); + else + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; @@ -818,3 +963,10 @@ out: return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +core_initcall(fprobe_initcall); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6981830c3128..ef2d5dca6f70 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -188,7 +188,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, fregs); } -static void ftrace_sync_ipi(void *data) +void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); @@ -534,7 +534,9 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { + struct trace_array *tr = trace_get_global_array(); struct ftrace_profile *rec = v; + const char *refsymbol = NULL; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; @@ -554,7 +556,29 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; #endif - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) { + unsigned long offset; + + if (core_kernel_text(rec->ip)) { + refsymbol = "_text"; + offset = rec->ip - (unsigned long)_text; + } else { + struct module *mod; + + guard(rcu)(); + mod = __module_text_address(rec->ip); + if (mod) { + refsymbol = mod->name; + /* Calculate offset from module's text entry address. 
*/ + offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base; + } + } + if (refsymbol) + snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset); + } + if (!refsymbol) + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -838,6 +862,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, return 1; } +bool fprofile_no_sleep_time; + static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) @@ -863,7 +889,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, calltime = rettime - profile_data->calltime; - if (!fgraph_sleep_time) { + if (fprofile_no_sleep_time) { if (current->ftrace_sleeptime) calltime -= current->ftrace_sleeptime - profile_data->sleeptime; } @@ -1042,10 +1068,6 @@ static struct ftrace_ops *removed_ops; */ static bool update_all_ops; -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -1975,7 +1997,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2010,10 +2033,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2024,7 +2050,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. 
+ */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2080,7 +2115,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2091,7 +2126,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2105,7 +2140,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -4373,6 +4408,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip) } #endif +static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + struct ftrace_ops *subops; + bool first = true; + + list_for_each_entry(subops, &ops->subop_list, list) { + if (!((subops->flags & FTRACE_OPS_FL_ENABLED) && + hash_contains_ip(rec->ip, subops->func_hash))) + continue; + if (first) { + seq_printf(m, "\tsubops:"); + first = false; + } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (subops->flags & FTRACE_OPS_FL_GRAPH) { + struct fgraph_ops *gops; + + gops = container_of(subops, struct fgraph_ops, ops); + seq_printf(m, " {ent:%pS ret:%pS}", + (void *)gops->entryfunc, + (void *)gops->retfunc); + continue; + } +#endif + if (subops->trampoline) { + seq_printf(m, " {%pS (%pS)}", + (void *)subops->trampoline, + (void *)subops->func); + add_trampoline_func(m, subops, rec); + } else { + seq_printf(m, " {%pS}", + (void *)subops->func); + } + } +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -4425,6 +4496,7 @@ static int t_show(struct seq_file *m, void *v) (void *)ops->trampoline, (void *)ops->func); add_trampoline_func(m, ops, rec); + print_subops(m, ops, rec); ops = ftrace_find_tramp_ops_next(rec, ops); } while (ops); } else @@ -4437,6 +4509,7 @@ static int t_show(struct seq_file *m, void *v) if (ops) { seq_printf(m, "\tops: %pS (%pS)", ops, ops->func); + print_subops(m, ops, rec); } else { seq_puts(m, "\tops: ERROR!"); } @@ -4445,8 +4518,11 @@ static int t_show(struct seq_file *m, void *v) unsigned long direct; direct = ftrace_find_rec_direct(rec->ip); - if (direct) - seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + if (direct) { + seq_printf(m, "\n\tdirect%s-->%pS", + ftrace_is_jmp(direct) ? "(jmp)" : "", + (void *)ftrace_jmp_get(direct)); + } } } @@ -4627,13 +4703,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); } + } else { + if (hash) + iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash); + else + iter->hash = EMPTY_HASH; + } - if (!iter->hash) { - trace_parser_put(&iter->parser); - goto out_unlock; - } - } else - iter->hash = hash; + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } ret = 0; @@ -5170,8 +5250,12 @@ struct ftrace_func_map { void *data; }; +/* + * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash). 
+ * The hash field must be the first field. + */ struct ftrace_func_mapper { - struct ftrace_hash hash; + struct ftrace_hash hash; /* Must be first! */ }; /** @@ -5306,6 +5390,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } } } + /* This also frees the mapper itself */ free_ftrace_hash(&mapper->hash); } @@ -5895,7 +5980,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { del = __ftrace_lookup_ip(direct_functions, entry->ip); - if (del && del->direct == addr) { + if (del && ftrace_jmp_get(del->direct) == + ftrace_jmp_get(addr)) { remove_hash_entry(direct_functions, del); kfree(del); } @@ -5910,6 +5996,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5949,8 +6046,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + mutex_lock(&direct_mutex); + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6000,11 +6104,13 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) new_hash = NULL; ops->func = call_direct_funcs; - ops->flags = MULTI_FLAGS; + ops->flags |= MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6037,7 +6143,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6047,13 +6152,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6063,7 +6164,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6074,6 +6175,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); + /* This is a "raw" address, and this should never happen. 
*/ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; @@ -6084,12 +6192,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6104,6 +6220,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); @@ -6504,9 +6621,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); - } else { - /* For read only, the hash is the ops hash */ - iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -7395,9 +7509,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7406,6 +7521,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7491,6 +7609,8 @@ void ftrace_module_enable(struct module *mod) if (!within_module(rec->ip, mod)) break; + cond_resched(); + /* Weak functions should still be ignored */ if (!test_for_valid_rec(rec)) { /* Clear all other flags. Should not be enabled anyway */ @@ -7584,6 +7704,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index c62b9b3cfb3d..dbee72d69d0a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -3,6 +3,7 @@ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org> */ #include <linux/spinlock.h> +#include <linux/seqlock.h> #include <linux/irq_work.h> #include <linux/slab.h> #include "trace.h" @@ -81,13 +82,9 @@ static inline bool upper_empty(union upper_chunk *chunk) { /* * If chunk->data has no lower chunks, it will be the same - * as a zeroed bitmask. Use find_first_bit() to test it - * and if it doesn't find any bits set, then the array - * is empty. + * as a zeroed bitmask. 
*/ - int bit = find_first_bit((unsigned long *)chunk->data, - sizeof(chunk->data) * 8); - return bit >= sizeof(chunk->data) * 8; + return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data)); } static inline int pid_split(unsigned int pid, unsigned int *upper1, @@ -130,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { union upper_chunk *upper_chunk; union lower_chunk *lower_chunk; - unsigned long flags; + unsigned int seq; unsigned int upper1; unsigned int upper2; unsigned int lower; @@ -142,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) if (pid_split(pid, &upper1, &upper2, &lower) < 0) return false; - raw_spin_lock_irqsave(&pid_list->lock, flags); - upper_chunk = pid_list->upper[upper1]; - if (upper_chunk) { - lower_chunk = upper_chunk->data[upper2]; - if (lower_chunk) - ret = test_bit(lower, lower_chunk->data); - } - raw_spin_unlock_irqrestore(&pid_list->lock, flags); + do { + seq = read_seqcount_begin(&pid_list->seqcount); + ret = false; + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + } while (read_seqcount_retry(&pid_list->seqcount, seq)); return ret; } @@ -182,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) { upper_chunk = get_upper_chunk(pid_list); @@ -203,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) set_bit(lower, lower_chunk->data); ret = 0; out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return ret; } @@ -234,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) goto out; @@ -254,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) } } out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -344,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork) again: raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); if (upper_count <= 0 && lower_count <= 0) @@ -374,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) } raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); if (upper) { *upper_next = pid_list->upper_list; pid_list->upper_list = upper; @@ -384,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) pid_list->lower_list = lower; pid_list->free_lower_chunks += lcnt; } + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); /* @@ -423,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); raw_spin_lock_init(&pid_list->lock); + seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock); for (i = 0; i < CHUNK_ALLOC; i++) { union upper_chunk *chunk; diff --git a/kernel/trace/pid_list.h 
b/kernel/trace/pid_list.h index 62e73f1ac85f..0b45fb0eadb9 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -76,6 +76,7 @@ union upper_chunk { }; struct trace_pid_list { + seqcount_raw_spinlock_t seqcount; raw_spinlock_t lock; struct irq_work refill_irqwork; union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..f2fe33573e54 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -17,5 +17,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); -EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0f877d39a24..41c9f5d079be 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,41 @@ static void free_buffer_page(struct buffer_page *bpage) } /* + * For best performance, allocate cpu buffer data cache line sized + * and per CPU. + */ +#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \ + kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +#define alloc_cpu_page(cpu) (struct buffer_page *) \ + kzalloc_node(ALIGN(sizeof(struct buffer_page), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +static struct buffer_data_page *alloc_cpu_data(int cpu, int order) +{ + struct buffer_data_page *dpage; + struct page *page; + gfp_t mflags; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO; + + page = alloc_pages_node(cpu_to_node(cpu), mflags, order); + if (!page) + return NULL; + + dpage = page_address(page); + rb_init_page(dpage); + + return dpage; +} + +/* * We need to fit the time_stamp delta into 27 bits. 
*/ static inline bool test_time_stamp(u64 delta) @@ -1358,6 +1393,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1728,7 +1770,7 @@ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) bmeta->total_size = total_size; bmeta->buffers_offset = (void *)ptr - (void *)bmeta; - /* Zero out the scatch pad */ + /* Zero out the scratch pad */ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); return false; @@ -1866,10 +1908,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct buffer_page *head_page, *orig_head; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) @@ -1885,12 +1928,104 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); - head_page = cpu_buffer->head_page; + orig_head = head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); + + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); + + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. 
+ */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + + skip_rewind: + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { @@ -2104,7 +2239,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; - gfp_t mflags; long i; /* @@ -2119,13 +2253,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ - mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; - - /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. 
This can happen @@ -2141,10 +2268,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); for (i = 0; i < nr_pages; i++) { - struct page *page; - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu_buffer->cpu)); + bpage = alloc_cpu_page(cpu_buffer->cpu); if (!bpage) goto free_pages; @@ -2167,13 +2292,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, bpage->range = 1; bpage->id = i + 1; } else { - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); + if (!bpage->page) goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; @@ -2224,14 +2346,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = + alloc_cpu_buffer(cpu); struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; - struct page *page; int ret; - cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; @@ -2247,10 +2367,9 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + bpage = alloc_cpu_page(cpu); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2270,13 +2389,10 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu, order); + if (!bpage->page) goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -2316,13 +2432,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2357,7 +2471,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, unsigned long scratch_size, struct lock_class_key *key) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2371,7 +2485,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2470,7 +2584,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2482,8 
+2596,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2847,6 +2959,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2891,7 +3009,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2931,7 +3048,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2959,8 +3075,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2979,7 +3093,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -4119,7 +4232,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static const char *show_irq_str(int bits) { - const char *type[] = { + static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 @@ -4682,10 +4795,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -4718,26 +4828,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = -EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4755,10 +4865,6 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); @@ -4883,6 +4989,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) } /** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. 
+ */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -5328,7 +5452,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: @@ -5832,24 +5955,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. - * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5875,51 +5994,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. 
- */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -6000,6 +6080,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ordering in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -6008,7 +6121,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); @@ -6078,21 +6193,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6280,37 +6390,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if 
(buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6347,7 +6453,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6375,7 +6480,6 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; - struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); @@ -6397,22 +6501,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage->data) - goto out; - - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) { - kfree(bpage); - return ERR_PTR(-ENOMEM); + if (bpage->data) { + rb_init_page(bpage->data); + } else { + bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); + if (!bpage->data) { + kfree(bpage); + return ERR_PTR(-ENOMEM); + } } - bpage->data = page_address(page); - - out: - rb_init_page(bpage->data); - return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); @@ -6506,38 +6604,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. 
*/ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6571,7 +6668,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6580,7 +6677,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6638,7 +6735,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6665,11 +6761,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6762,7 +6854,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6867,7 +6959,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6876,7 +6967,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -6924,23 +7014,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); + WARN_ON(cnt != nr_subbufs); + /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; @@ -7032,7 +7128,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7100,10 +7196,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu 
*cpu_buffer, struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); @@ -7118,9 +7212,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7136,36 +7227,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; - int err = 0; + int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7193,10 +7282,6 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - return err; } @@ -7204,24 +7289,22 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7236,12 +7319,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7275,6 +7353,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? 
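A hedged aside on the ring-buffer hunks above: they replace goto-unlock exits with the scope-based cleanup helpers from linux/cleanup.h, e.g. guard(mutex)(&buffer->mutex) and guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock), which is why the out/out_unlock labels disappear. A minimal sketch of that pattern, assuming only the guard() helpers; struct demo and demo_update() are made-up stand-ins, not ring-buffer code:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct demo {				/* hypothetical example state */
	struct mutex	lock;
	bool		ready;
	int		value;
};

static int demo_update(struct demo *d)
{
	guard(mutex)(&d->lock);		/* mutex_unlock() runs on every return path */

	if (!d->ready)
		return -EAGAIN;		/* early return, no "goto unlock" label needed */

	d->value++;
	return 0;
}

The same idea lets ring_buffer_read_page() and ring_buffer_map() above return directly from the middle of the locked region without leaking the lock.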
*/ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; @@ -7282,8 +7364,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7304,13 +7386,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; @@ -7577,7 +7669,7 @@ static __init int test_ringbuffer(void) /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be - * dropped and the thread wont catch it. But when a ring + * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after @@ -7587,7 +7679,7 @@ static __init int test_ringbuffer(void) rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); - /* Just run for 10 seconds */; + /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index cdc3aea12c93..593e3b59e42e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void) { int ret; - /* make a one meg buffer in overwite mode */ + /* make a one meg buffer in overwrite mode */ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); if (!buffer) return -ENOMEM; diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b39f36013ef2..5b4be87ba59d 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,19 +1,31 @@ # SPDX-License-Identifier: GPL-2.0-only # -config DA_MON_EVENTS +config RV_MON_EVENTS + bool + +config RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_IMPLICIT - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_ID - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS + bool + +config LTL_MON_EVENTS_ID + select RV_MON_EVENTS + bool + +config RV_LTL_MONITOR bool menuconfig RV bool "Runtime Verification" - depends on TRACING + select TRACING help Enable the kernel runtime verification infrastructure. 
RV is a lightweight (yet rigorous) method that complements classical @@ -25,15 +37,34 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_PER_TASK_MONITORS + int "Maximum number of per-task monitor" + depends on RV + range 1 8 + default 2 + help + This option configures the maximum number of per-task RV monitors that can run + simultaneously. + source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" + source "kernel/trace/rv/monitors/sched/Kconfig" -source "kernel/trace/rv/monitors/tss/Kconfig" source "kernel/trace/rv/monitors/sco/Kconfig" source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" -source "kernel/trace/rv/monitors/sncid/Kconfig" +source "kernel/trace/rv/monitors/sts/Kconfig" +source "kernel/trace/rv/monitors/nrp/Kconfig" +source "kernel/trace/rv/monitors/sssw/Kconfig" +source "kernel/trace/rv/monitors/opid/Kconfig" +# Add new sched monitors here + +source "kernel/trace/rv/monitors/rtapp/Kconfig" +source "kernel/trace/rv/monitors/pagefault/Kconfig" +source "kernel/trace/rv/monitors/sleep/Kconfig" +# Add new rtapp monitors here + # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index f9b2cd0483c3..750e4ad6fa0f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -6,12 +6,17 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o -obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o -obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o +obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o +obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o +obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o +obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o +obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o +obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o +obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig index 76bcfef4fd10..f5ec08f65535 100644 --- a/kernel/trace/rv/monitors/sncid/Kconfig +++ b/kernel/trace/rv/monitors/nrp/Kconfig @@ -1,15 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_SNCID +config RV_MON_NRP depends on RV - depends on IRQSOFF_TRACER depends on RV_MON_SCHED - default y - select DA_MON_EVENTS_IMPLICIT - bool "sncid monitor" + default y if !ARM64 + select DA_MON_EVENTS_ID + bool "nrp monitor" help - Monitor to ensure schedule is not called with interrupt disabled. + Monitor to ensure preemption requires need resched. This monitor is part of the sched monitors collection. + This monitor is unstable on arm64, say N unless you are testing it. 
+ For further information, see: Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c new file mode 100644 index 000000000000..5a83b7171432 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "nrp" + +#include <trace/events/irq.h> +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "nrp.h" + +static struct rv_monitor rv_nrp; +DECLARE_DA_MON_PER_TASK(nrp, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, + int cpu, int tif) +{ + /* + * Although need_resched leads to both the rescheduling and preempt_irq + * states, it is safer to start the monitor always in preempt_irq, + * which may not mirror the system state but makes the monitor simpler, + */ + if (tif == TIF_NEED_RESCHED) + da_handle_start_event_nrp(tsk, sched_need_resched_nrp); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + if (preempt) + da_handle_event_nrp(current, schedule_entry_preempt_nrp); + else + da_handle_event_nrp(current, schedule_entry_nrp); +} + +static int enable_nrp(void) +{ + int retval; + + retval = da_monitor_init_nrp(); + if (retval) + return retval; + + rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + attach_vector_irq(); + + return 0; +} + +static void disable_nrp(void) +{ + rv_nrp.enabled = 0; + + rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("nrp", 
sched_entry_tp, handle_schedule_entry); + detach_vector_irq(); + + da_monitor_destroy_nrp(); +} + +static struct rv_monitor rv_nrp = { + .name = "nrp", + .description = "need resched preempts.", + .enable = enable_nrp, + .disable = disable_nrp, + .reset = da_monitor_reset_all_nrp, + .enabled = 0, +}; + +static int __init register_nrp(void) +{ + return rv_register_monitor(&rv_nrp, &rv_sched); +} + +static void __exit unregister_nrp(void) +{ + rv_unregister_monitor(&rv_nrp); +} + +module_init(register_nrp); +module_exit(unregister_nrp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nrp: need resched preempts."); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h new file mode 100644 index 000000000000..c9f12207cbf6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nrp automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_nrp { + preempt_irq_nrp = 0, + any_thread_running_nrp, + nested_preempt_nrp, + rescheduling_nrp, + state_max_nrp +}; + +#define INVALID_STATE state_max_nrp + +enum events_nrp { + irq_entry_nrp = 0, + sched_need_resched_nrp, + schedule_entry_nrp, + schedule_entry_preempt_nrp, + event_max_nrp +}; + +struct automaton_nrp { + char *state_names[state_max_nrp]; + char *event_names[event_max_nrp]; + unsigned char function[state_max_nrp][event_max_nrp]; + unsigned char initial_state; + bool final_states[state_max_nrp]; +}; + +static const struct automaton_nrp automaton_nrp = { + .state_names = { + "preempt_irq", + "any_thread_running", + "nested_preempt", + "rescheduling" + }, + .event_names = { + "irq_entry", + "sched_need_resched", + "schedule_entry", + "schedule_entry_preempt" + }, + .function = { + { + preempt_irq_nrp, + preempt_irq_nrp, + nested_preempt_nrp, + nested_preempt_nrp + }, + { + any_thread_running_nrp, + rescheduling_nrp, + any_thread_running_nrp, + INVALID_STATE + }, + { + nested_preempt_nrp, + preempt_irq_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + { + preempt_irq_nrp, + rescheduling_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + }, + .initial_state = preempt_irq_nrp, + .final_states = { 0, 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h new file mode 100644 index 000000000000..2e13497de3b6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NRP +DEFINE_EVENT(event_da_monitor_id, event_nrp, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nrp, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_NRP */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig new file mode 100644 index 000000000000..561d32da572b --- /dev/null +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_OPID + depends on RV + depends on TRACE_IRQFLAGS + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y if PREEMPT_RT + select DA_MON_EVENTS_IMPLICIT + bool 
"opid monitor" + help + Monitor to ensure operations like wakeup and need resched occur with + interrupts and preemption disabled or during IRQs, where preemption + may not be disabled explicitly. + + This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c new file mode 100644 index 000000000000..50d64e7fb8c4 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "opid" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "opid.h" + +static struct rv_monitor rv_opid; +DECLARE_DA_MON_PER_CPU(opid, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_disable_opid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_enable_opid); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_disable_opid); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_enable_opid); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) +{ + /* The monitor's intitial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_need_resched_opid); + else + da_handle_start_event_opid(sched_need_resched_opid); +} + +static void handle_sched_waking(void *data, struct task_struct *p) +{ + 
/* The monitor's intitial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_waking_opid); + else + da_handle_start_event_opid(sched_waking_opid); +} + +static int enable_opid(void) +{ + int retval; + + retval = da_monitor_init_opid(); + if (retval) + return retval; + + rv_attach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); + attach_vector_irq(); + + return 0; +} + +static void disable_opid(void) +{ + rv_opid.enabled = 0; + + rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); + detach_vector_irq(); + + da_monitor_destroy_opid(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_opid = { + .name = "opid", + .description = "operations with preemption and irq disabled.", + .enable = enable_opid, + .disable = disable_opid, + .reset = da_monitor_reset_all_opid, + .enabled = 0, +}; + +static int __init register_opid(void) +{ + return rv_register_monitor(&rv_opid, &rv_sched); +} + +static void __exit unregister_opid(void) +{ + rv_unregister_monitor(&rv_opid); +} + +module_init(register_opid); +module_exit(unregister_opid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("opid: operations with preemption and irq disabled."); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h new file mode 100644 index 000000000000..b4b8c2ff7f64 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of opid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_opid { + disabled_opid = 0, + enabled_opid, + in_irq_opid, + irq_disabled_opid, + preempt_disabled_opid, + state_max_opid +}; + +#define INVALID_STATE state_max_opid + +enum events_opid { + irq_disable_opid = 0, + irq_enable_opid, + irq_entry_opid, + preempt_disable_opid, + preempt_enable_opid, + sched_need_resched_opid, + sched_waking_opid, + event_max_opid +}; + +struct automaton_opid { + char *state_names[state_max_opid]; + char *event_names[event_max_opid]; + unsigned char function[state_max_opid][event_max_opid]; + unsigned char initial_state; + bool final_states[state_max_opid]; +}; + +static const struct automaton_opid automaton_opid = { + .state_names = { + "disabled", + "enabled", + "in_irq", + "irq_disabled", + "preempt_disabled" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "preempt_disable", + "preempt_enable", + "sched_need_resched", + "sched_waking" + }, + .function = { + 
{ + INVALID_STATE, + preempt_disabled_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + disabled_opid, + disabled_opid + }, + { + irq_disabled_opid, + INVALID_STATE, + INVALID_STATE, + preempt_disabled_opid, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + INVALID_STATE, + INVALID_STATE, + in_irq_opid, + in_irq_opid + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + INVALID_STATE + }, + { + disabled_opid, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = disabled_opid, + .final_states = { 0, 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h index 3ce42a57671d..3df6ff955c30 100644 --- a/kernel/trace/rv/monitors/sncid/sncid_trace.h +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_SNCID -DEFINE_EVENT(event_da_monitor, event_sncid, +#ifdef CONFIG_RV_MON_OPID +DEFINE_EVENT(event_da_monitor, event_opid, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_sncid, +DEFINE_EVENT(error_da_monitor, error_opid, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_SNCID */ +#endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig new file mode 100644 index 000000000000..0e013f00c33b --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_PAGEFAULT + depends on RV + select RV_LTL_MONITOR + depends on RV_MON_RTAPP + depends on X86 || RISCV + depends on MMU + default y + select LTL_MON_EVENTS_ID + bool "pagefault monitor" + help + Monitor that real-time tasks do not raise page faults, causing + undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + This monitor does not affect execution speed while it is not running, + therefore it is safe to enable this in production kernel. diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c new file mode 100644 index 000000000000..9fe6123b2200 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <linux/tracepoint.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "pagefault" + +#include <rv_trace.h> +#include <trace/events/exceptions.h> +#include <monitors/rtapp/rtapp.h> + +#include "pagefault.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. 
+ */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + if (task_creation) + ltl_atom_set(mon, LTL_PAGEFAULT, false); +} + +static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + ltl_atom_pulse(current, LTL_PAGEFAULT, true); +} + +static int enable_pagefault(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + return 0; +} + +static void disable_pagefault(void) +{ + rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_pagefault = { + .name = "pagefault", + .description = "Monitor that RT tasks do not raise page faults", + .enable = enable_pagefault, + .disable = disable_pagefault, +}; + +static int __init register_pagefault(void) +{ + return rv_register_monitor(&rv_pagefault, &rv_rtapp); +} + +static void __exit unregister_pagefault(void) +{ + rv_unregister_monitor(&rv_pagefault); +} + +module_init(register_pagefault); +module_exit(unregister_pagefault); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults"); diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h new file mode 100644 index 000000000000..c580ec194009 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. 
+ * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME pagefault + +enum ltl_atom { + LTL_PAGEFAULT, + LTL_RT, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "pa", + "rt", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + if (val4) + __set_bit(S0, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + switch (state) { + case S0: + if (val4) + __set_bit(S0, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h new file mode 100644 index 000000000000..fe1f82597b1a --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_PAGEFAULT +DEFINE_EVENT(event_ltl_monitor_id, event_pagefault, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_pagefault, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_PAGEFAULT */ diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig new file mode 100644 index 000000000000..1ce9370a9ba8 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_RTAPP + depends on RV + depends on RV_PER_TASK_MONITORS >= 2 + bool "rtapp monitor" + help + Collection of monitors to check for common problems with real-time + application that may cause unexpected latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. 
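Reading the generated pagefault automaton above back into temporal logic: it has a single Buchi state S0, entered and kept exactly when val4 = !rt || !pagefault holds, so the property being checked is, reconstructed from the generated header rather than quoted from the rvgen specification, roughly:

	always ((not RT) or (not PAGEFAULT))
	i.e.   always (RT imply not PAGEFAULT)

In other words, once a task is real-time (or PI-boosted to real-time), a page fault leaves the monitor with no possible successor state, which is the violation condition.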
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c new file mode 100644 index 000000000000..fd75fc927d65 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "rtapp" + +#include "rtapp.h" + +struct rv_monitor rv_rtapp; + +struct rv_monitor rv_rtapp = { + .name = "rtapp", + .description = "Collection of monitors for detecting problems with real-time applications", +}; + +static int __init register_rtapp(void) +{ + return rv_register_monitor(&rv_rtapp, NULL); +} + +static void __exit unregister_rtapp(void) +{ + rv_unregister_monitor(&rv_rtapp); +} + +module_init(register_rtapp); +module_exit(unregister_rtapp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications"); diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h new file mode 100644 index 000000000000..4c200d67c7f6 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_rtapp; diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig index ae3eb410abd7..aa16456da864 100644 --- a/kernel/trace/rv/monitors/sched/Kconfig +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -2,6 +2,7 @@ # config RV_MON_SCHED depends on RV + depends on RV_PER_TASK_MONITORS >= 3 bool "sched monitor" help Collection of monitors to check the scheduler behaves according to specifications. diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index 905e03c3c934..d04db4b543f9 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -21,8 +21,7 @@ struct rv_monitor rv_sched = { static int __init register_sched(void) { - rv_register_monitor(&rv_sched, NULL); - return 0; + return rv_register_monitor(&rv_sched, NULL); } static void __exit unregister_sched(void) diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 4cff59220bfc..04c36405e2e3 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat da_handle_start_event_sco(sched_set_state_sco); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_sco(schedule_entry_sco); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_sco(schedule_exit_sco); } @@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = { static int __init register_sco(void) { - rv_register_monitor(&rv_sco, &rv_sched); - return 0; + return rv_register_monitor(&rv_sco, &rv_sched); } static void __exit unregister_sco(void) diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig index b9114fbf680f..682d0416188b 100644 --- a/kernel/trace/rv/monitors/scpd/Kconfig +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SCPD depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select 
DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index cbdd6a5f8d7f..1e351ba52fee 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_scpd(preempt_enable_scpd); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_scpd(schedule_entry_scpd); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_event_scpd(schedule_exit_scpd); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = { static int __init register_scpd(void) { - rv_register_monitor(&rv_scpd, &rv_sched); - return 0; + return rv_register_monitor(&rv_scpd, &rv_sched); } static void __exit unregister_scpd(void) diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig new file mode 100644 index 000000000000..6b7a122e7b47 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SLEEP + depends on RV + select RV_LTL_MONITOR + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_RTAPP + select TRACE_IRQFLAGS + default y + select LTL_MON_EVENTS_ID + bool "sleep monitor" + help + Monitor that real-time tasks do not sleep in a manner that may + cause undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + Enabling this monitor may have performance impact (due to select + TRACE_IRQFLAGS). Therefore, you probably should say N for + production kernel. diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c new file mode 100644 index 000000000000..c1347da69e9d --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sleep" + +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/lock.h> +#include <uapi/linux/futex.h> +#include <rv_trace.h> +#include <monitors/rtapp/rtapp.h> + +#include "sleep.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. 
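The pagefault and sleep monitors combine three atom helpers. Their roles are inferred here from how they are used in these files, as an assumption rather than a quote from rv/ltl_monitor.h:

	/* init/fetch paths: record a value, no evaluation step yet */
	ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));

	/* persistent condition: record the new value and re-evaluate the monitor */
	ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true);

	/* momentary event: evaluate with the atom raised, the value does not persist */
	ltl_atom_pulse(task, LTL_WAKE, true);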
+ */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + ltl_atom_set(mon, LTL_SLEEP, false); + ltl_atom_set(mon, LTL_WAKE, false); + ltl_atom_set(mon, LTL_ABORT_SLEEP, false); + ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false); + ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false); + ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false); + + if (task_creation) { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); + } + + if (task->flags & PF_KTHREAD) { + ltl_atom_set(mon, LTL_KERNEL_THREAD, true); + + /* kernel tasks do not do syscall */ + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + + if (strstarts(task->comm, "migration/")) + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); + else + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + + if (strstarts(task->comm, "rcu")) + ltl_atom_set(mon, LTL_TASK_IS_RCU, true); + else + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + } else { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_KERNEL_THREAD, false); + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + } + +} + +static void handle_sched_set_state(void *data, struct task_struct *task, int state) +{ + if (state & TASK_INTERRUPTIBLE) + ltl_atom_pulse(task, LTL_SLEEP, true); + else if (state == TASK_RUNNING) + ltl_atom_pulse(task, LTL_ABORT_SLEEP, true); +} + +static void handle_sched_wakeup(void *data, struct task_struct *task) +{ + ltl_atom_pulse(task, LTL_WAKE, true); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + if (this_cpu_read(hardirq_context)) { + ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true); + } else if (in_task()) { + if (current->prio <= task->prio) + ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true); + } else if (in_nmi()) { + ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true); + } +} + +static void handle_contention_begin(void *data, void *lock, unsigned int flags) +{ + if (flags & LCB_F_RT) + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true); +} + +static void handle_contention_end(void *data, void *lock, int ret) +{ + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false); +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct ltl_monitor *mon; + unsigned long args[6]; + int op, cmd; + + mon = ltl_get_monitor(current); + + switch (id) { +#ifdef __NR_clock_nanosleep + case __NR_clock_nanosleep: +#endif +#ifdef __NR_clock_nanosleep_time64 + case __NR_clock_nanosleep_time64: +#endif + syscall_get_arguments(current, regs, args); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); + break; + +#ifdef 
__NR_futex + case __NR_futex: +#endif +#ifdef __NR_futex_time64 + case __NR_futex_time64: +#endif + syscall_get_arguments(current, regs, args); + op = args[1]; + cmd = op & FUTEX_CMD_MASK; + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true); + break; + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + ltl_atom_update(current, LTL_FUTEX_WAIT, true); + break; + } + break; + } +} + +static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) +{ + struct ltl_monitor *mon = ltl_get_monitor(current); + + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); +} + +static void handle_kthread_stop(void *data, struct task_struct *task) +{ + /* FIXME: this could race with other tracepoint handlers */ + ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true); +} + +static int enable_sleep(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + return 0; +} + +static void disable_sleep(void) +{ + rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_sleep = { + .name = "sleep", + .description = "Monitor that RT tasks do not undesirably sleep", + .enable = enable_sleep, + .disable = disable_sleep, +}; + +static int __init register_sleep(void) +{ + return rv_register_monitor(&rv_sleep, &rv_rtapp); +} + +static void __exit unregister_sleep(void) +{ + rv_unregister_monitor(&rv_sleep); +} + +module_init(register_sleep); +module_exit(unregister_sleep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep"); diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h new file mode 100644 index 000000000000..2ab46fd218d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. 
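For orientation, the generated ltl_start()/ltl_possible_next_states() pairs (pagefault.h above, sleep.h below) are consumed by the common LTL monitor code: the monitor tracks the set of currently possible Buchi states and advances every member on each atom change; an empty set means the observed trace violated the specification. A minimal sketch of that stepping loop, written as an assumption about rv/ltl_monitor.h rather than a copy of it:

#include <linux/bitmap.h>
#include <linux/rv.h>

static bool ltl_step_sketch(struct ltl_monitor *mon)
{
	DECLARE_BITMAP(next, RV_MAX_BA_STATES) = { 0 };
	unsigned int state;

	/* advance every currently possible state through the generated helper */
	for_each_set_bit(state, mon->states, RV_NUM_BA_STATES)
		ltl_possible_next_states(mon, state, next);

	bitmap_copy(mon->states, next, RV_NUM_BA_STATES);

	/* no reachable state left: no run of the automaton matches the trace */
	return !bitmap_empty(mon->states, RV_NUM_BA_STATES);
}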
+ * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME sleep + +enum ltl_atom { + LTL_ABORT_SLEEP, + LTL_BLOCK_ON_RT_MUTEX, + LTL_CLOCK_NANOSLEEP, + LTL_FUTEX_LOCK_PI, + LTL_FUTEX_WAIT, + LTL_KERNEL_THREAD, + LTL_KTHREAD_SHOULD_STOP, + LTL_NANOSLEEP_CLOCK_MONOTONIC, + LTL_NANOSLEEP_CLOCK_TAI, + LTL_NANOSLEEP_TIMER_ABSTIME, + LTL_RT, + LTL_SLEEP, + LTL_TASK_IS_MIGRATION, + LTL_TASK_IS_RCU, + LTL_WAKE, + LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + LTL_WOKEN_BY_HARDIRQ, + LTL_WOKEN_BY_NMI, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "ab_sl", + "bl_on_rt_mu", + "cl_na", + "fu_lo_pi", + "fu_wa", + "ker_th", + "kth_sh_st", + "na_cl_mo", + "na_cl_ta", + "na_ti_ab", + "rt", + "sl", + "ta_mi", + "ta_rc", + "wak", + "wo_eq_hi_pr", + "wo_ha", + "wo_nm", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + if (val3) + __set_bit(S0, mon->states); + if (val11 && val13) + __set_bit(S1, mon->states); + if (val11 && val14) + __set_bit(S4, mon->states); + if (val5) + __set_bit(S5, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || 
task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + switch (state) { + case S0: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S1: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S2: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S3: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S4: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S5: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S6: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S7: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h 
b/kernel/trace/rv/monitors/sleep/sleep_trace.h new file mode 100644 index 000000000000..22eaf31da987 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SLEEP +DEFINE_EVENT(event_ltl_monitor_id, event_sleep, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_sleep, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_SLEEP */ diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c deleted file mode 100644 index f5037cd6214c..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "sncid" - -#include <trace/events/sched.h> -#include <trace/events/preemptirq.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "sncid.h" - -static struct rv_monitor rv_sncid; -DECLARE_DA_MON_PER_CPU(sncid, unsigned char); - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event_sncid(irq_disable_sncid); -} - -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_start_event_sncid(irq_enable_sncid); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_entry_sncid); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_exit_sncid); -} - -static int enable_sncid(void) -{ - int retval; - - retval = da_monitor_init_sncid(); - if (retval) - return retval; - - rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_sncid(void) -{ - rv_sncid.enabled = 0; - - rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_sncid(); -} - -static struct rv_monitor rv_sncid = { - .name = "sncid", - .description = "schedule not called with interrupt disabled.", - .enable = enable_sncid, - .disable = disable_sncid, - .reset = da_monitor_reset_all_sncid, - .enabled = 0, -}; - -static int __init register_sncid(void) -{ - rv_register_monitor(&rv_sncid, &rv_sched); - return 0; -} - -static void __exit unregister_sncid(void) -{ - rv_unregister_monitor(&rv_sncid); -} - -module_init(register_sncid); -module_exit(unregister_sncid); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h deleted file mode 100644 index 21304725142b..000000000000 --- 
a/kernel/trace/rv/monitors/sncid/sncid.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of sncid automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_sncid { - can_sched_sncid = 0, - cant_sched_sncid, - state_max_sncid -}; - -#define INVALID_STATE state_max_sncid - -enum events_sncid { - irq_disable_sncid = 0, - irq_enable_sncid, - schedule_entry_sncid, - schedule_exit_sncid, - event_max_sncid -}; - -struct automaton_sncid { - char *state_names[state_max_sncid]; - char *event_names[event_max_sncid]; - unsigned char function[state_max_sncid][event_max_sncid]; - unsigned char initial_state; - bool final_states[state_max_sncid]; -}; - -static const struct automaton_sncid automaton_sncid = { - .state_names = { - "can_sched", - "cant_sched" - }, - .event_names = { - "irq_disable", - "irq_enable", - "schedule_entry", - "schedule_exit" - }, - .function = { - { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, - { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, - }, - .initial_state = can_sched_sncid, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig index 77527f971232..7dd54f434ff7 100644 --- a/kernel/trace/rv/monitors/snep/Kconfig +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SNEP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 0076ba6d7ea4..558950f524a5 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_snep(preempt_enable_snep); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_snep(schedule_entry_snep); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_snep(schedule_exit_snep); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = { static int __init register_snep(void) { - rv_register_monitor(&rv_snep, &rv_sched); - return 0; + return rv_register_monitor(&rv_snep, &rv_sched); } static void __exit unregister_snep(void) diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 6d16b9ad931e..4cd9abb77b7b 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = { "schedule_exit" }, .function = { - { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, - { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + { + non_scheduling_context_snep, + non_scheduling_context_snep, + scheduling_contex_snep, + INVALID_STATE + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + non_scheduling_context_snep + }, }, .initial_state = non_scheduling_context_snep, .final_states = { 1, 0 }, diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index bb1f60d55296..540e686e699f 
100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = { static int __init register_snroc(void) { - rv_register_monitor(&rv_snroc, &rv_sched); - return 0; + return rv_register_monitor(&rv_snroc, &rv_sched); } static void __exit unregister_snroc(void) diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig index 479f86f52e60..23b7eeb38bbf 100644 --- a/kernel/trace/rv/monitors/tss/Kconfig +++ b/kernel/trace/rv/monitors/sssw/Kconfig @@ -1,13 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_TSS +config RV_MON_SSSW depends on RV depends on RV_MON_SCHED default y - select DA_MON_EVENTS_IMPLICIT - bool "tss monitor" + select DA_MON_EVENTS_ID + bool "sssw monitor" help - Monitor to ensure sched_switch happens only in scheduling context. + Monitor to ensure sched_set_state to sleepable leads to sleeping and + sleeping tasks require wakeup. This monitor is part of the sched monitors collection. For further information, see: diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c new file mode 100644 index 000000000000..84b8d890d9d4 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sssw" + +#include <trace/events/sched.h> +#include <trace/events/signal.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sssw.h" + +static struct rv_monitor rv_sssw; +DECLARE_DA_MON_PER_TASK(sssw, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + if (state == TASK_RUNNING) + da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + else + da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (preempt) + da_handle_event_sssw(prev, sched_switch_preempt_sssw); + else if (prev_state == TASK_RUNNING) + da_handle_event_sssw(prev, sched_switch_yield_sssw); + else if (prev_state == TASK_RTLOCK_WAIT) + /* special case of sleeping task with racy conditions */ + da_handle_event_sssw(prev, sched_switch_blocking_sssw); + else + da_handle_event_sssw(prev, sched_switch_suspend_sssw); + da_handle_event_sssw(next, sched_switch_in_sssw); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + /* + * Wakeup can also lead to signal_wakeup although the system is + * actually runnable. The monitor can safely start with this event. 
+ */ + da_handle_start_event_sssw(p, sched_wakeup_sssw); +} + +static void handle_signal_deliver(void *data, int sig, + struct kernel_siginfo *info, + struct k_sigaction *ka) +{ + da_handle_event_sssw(current, signal_deliver_sssw); +} + +static int enable_sssw(void) +{ + int retval; + + retval = da_monitor_init_sssw(); + if (retval) + return retval; + + rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + return 0; +} + +static void disable_sssw(void) +{ + rv_sssw.enabled = 0; + + rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + da_monitor_destroy_sssw(); +} + +static struct rv_monitor rv_sssw = { + .name = "sssw", + .description = "set state sleep and wakeup.", + .enable = enable_sssw, + .disable = disable_sssw, + .reset = da_monitor_reset_all_sssw, + .enabled = 0, +}; + +static int __init register_sssw(void) +{ + return rv_register_monitor(&rv_sssw, &rv_sched); +} + +static void __exit unregister_sssw(void) +{ + rv_unregister_monitor(&rv_sssw); +} + +module_init(register_sssw); +module_exit(unregister_sssw); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sssw: set state sleep and wakeup."); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h new file mode 100644 index 000000000000..243d54050c94 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sssw automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sssw { + runnable_sssw = 0, + signal_wakeup_sssw, + sleepable_sssw, + sleeping_sssw, + state_max_sssw +}; + +#define INVALID_STATE state_max_sssw + +enum events_sssw { + sched_set_state_runnable_sssw = 0, + sched_set_state_sleepable_sssw, + sched_switch_blocking_sssw, + sched_switch_in_sssw, + sched_switch_preempt_sssw, + sched_switch_suspend_sssw, + sched_switch_yield_sssw, + sched_wakeup_sssw, + signal_deliver_sssw, + event_max_sssw +}; + +struct automaton_sssw { + char *state_names[state_max_sssw]; + char *event_names[event_max_sssw]; + unsigned char function[state_max_sssw][event_max_sssw]; + unsigned char initial_state; + bool final_states[state_max_sssw]; +}; + +static const struct automaton_sssw automaton_sssw = { + .state_names = { + "runnable", + "signal_wakeup", + "sleepable", + "sleeping" + }, + .event_names = { + "sched_set_state_runnable", + "sched_set_state_sleepable", + "sched_switch_blocking", + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_suspend", + "sched_switch_yield", + "sched_wakeup", + "signal_deliver" + }, + .function = { + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + runnable_sssw, + runnable_sssw, + INVALID_STATE, + runnable_sssw, + runnable_sssw, + runnable_sssw + }, + { + INVALID_STATE, + sleepable_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + runnable_sssw + }, + { + runnable_sssw, + 
sleepable_sssw, + sleeping_sssw, + sleepable_sssw, + sleepable_sssw, + sleeping_sssw, + signal_wakeup_sssw, + runnable_sssw, + sleepable_sssw + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + runnable_sssw, + INVALID_STATE + }, + }, + .initial_state = runnable_sssw, + .final_states = { 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h new file mode 100644 index 000000000000..6c03cfc6960b --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SSSW +DEFINE_EVENT(event_da_monitor_id, event_sssw, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_sssw, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SSSW */ diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig new file mode 100644 index 000000000000..7d1ff0f6fc91 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STS + depends on RV + depends on TRACE_IRQFLAGS + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sts monitor" + help + Monitor to ensure relationships between scheduler and task switches + * the scheduler is called and returns with interrupts disabled + * each call to the scheduler has up to one switch + * switches only happen inside the scheduler + * each call to the scheduler disables interrupts to switch + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c new file mode 100644 index 000000000000..c4a9cd67c1d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sts" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sts.h" + +static struct rv_monitor rv_sts; +DECLARE_DA_MON_PER_CPU(sts, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_disable_sts); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_enable_sts); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_sts(sched_switch_sts); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event_sts(schedule_entry_sts); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event_sts(schedule_exit_sts); +} + +static int enable_sts(void) +{ + int retval; + + retval = da_monitor_init_sts(); + if (retval) + return retval; + + rv_attach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + attach_vector_irq(); + 
+ return 0; +} + +static void disable_sts(void) +{ + rv_sts.enabled = 0; + + rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + detach_vector_irq(); + + da_monitor_destroy_sts(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_sts = { + .name = "sts", + .description = "schedule implies task switch.", + .enable = enable_sts, + .disable = disable_sts, + .reset = da_monitor_reset_all_sts, + .enabled = 0, +}; + +static int __init register_sts(void) +{ + return rv_register_monitor(&rv_sts, &rv_sched); +} + +static void __exit unregister_sts(void) +{ + rv_unregister_monitor(&rv_sts); +} + +module_init(register_sts); +module_exit(unregister_sts); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sts: schedule implies task switch."); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h new file mode 100644 index 000000000000..3368b6599a00 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sts automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sts { + can_sched_sts = 0, + cant_sched_sts, + disable_to_switch_sts, + enable_to_exit_sts, + in_irq_sts, + scheduling_sts, + switching_sts, + state_max_sts +}; + +#define INVALID_STATE state_max_sts + +enum events_sts { + irq_disable_sts = 0, + irq_enable_sts, + irq_entry_sts, + sched_switch_sts, + schedule_entry_sts, + schedule_exit_sts, + event_max_sts +}; + +struct automaton_sts { + char *state_names[state_max_sts]; + char *event_names[event_max_sts]; + unsigned char function[state_max_sts][event_max_sts]; + unsigned char initial_state; + bool final_states[state_max_sts]; +}; + +static const struct automaton_sts automaton_sts = { + .state_names = { + "can_sched", + "cant_sched", + "disable_to_switch", + "enable_to_exit", + "in_irq", + "scheduling", + "switching" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + scheduling_sts, + INVALID_STATE + }, + { + INVALID_STATE, + can_sched_sts, + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + in_irq_sts, + switching_sts, + INVALID_STATE, + INVALID_STATE + }, + { + enable_to_exit_sts, + enable_to_exit_sts, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + can_sched_sts + }, + { + INVALID_STATE, + scheduling_sts, + in_irq_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + disable_to_switch_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = can_sched_sts, + .final_states = { 1, 0, 0, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h 
index 4619dbb50cc0..d78beb58d5b3 100644 --- a/kernel/trace/rv/monitors/tss/tss_trace.h +++ b/kernel/trace/rv/monitors/sts/sts_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_TSS -DEFINE_EVENT(event_da_monitor, event_tss, +#ifdef CONFIG_RV_MON_STS +DEFINE_EVENT(event_da_monitor, event_sts, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_tss, +DEFINE_EVENT(error_da_monitor, error_sts, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_TSS */ +#endif /* CONFIG_RV_MON_STS */ diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c deleted file mode 100644 index 542787e6524f..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "tss" - -#include <trace/events/sched.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "tss.h" - -static struct rv_monitor rv_tss; -DECLARE_DA_MON_PER_CPU(tss, unsigned char); - -static void handle_sched_switch(void *data, bool preempt, - struct task_struct *prev, - struct task_struct *next, - unsigned int prev_state) -{ - da_handle_event_tss(sched_switch_tss); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_event_tss(schedule_entry_tss); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_tss(schedule_exit_tss); -} - -static int enable_tss(void) -{ - int retval; - - retval = da_monitor_init_tss(); - if (retval) - return retval; - - rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_tss(void) -{ - rv_tss.enabled = 0; - - rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_tss(); -} - -static struct rv_monitor rv_tss = { - .name = "tss", - .description = "task switch while scheduling.", - .enable = enable_tss, - .disable = disable_tss, - .reset = da_monitor_reset_all_tss, - .enabled = 0, -}; - -static int __init register_tss(void) -{ - rv_register_monitor(&rv_tss, &rv_sched); - return 0; -} - -static void __exit unregister_tss(void) -{ - rv_unregister_monitor(&rv_tss); -} - -module_init(register_tss); -module_exit(unregister_tss); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h deleted file mode 100644 index f0a36fda1b87..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of tss automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_tss { - thread_tss = 0, - 
sched_tss, - state_max_tss -}; - -#define INVALID_STATE state_max_tss - -enum events_tss { - sched_switch_tss = 0, - schedule_entry_tss, - schedule_exit_tss, - event_max_tss -}; - -struct automaton_tss { - char *state_names[state_max_tss]; - char *event_names[event_max_tss]; - unsigned char function[state_max_tss][event_max_tss]; - unsigned char initial_state; - bool final_states[state_max_tss]; -}; - -static const struct automaton_tss automaton_tss = { - .state_names = { - "thread", - "sched" - }, - .event_names = { - "sched_switch", - "schedule_entry", - "schedule_exit" - }, - .function = { - { INVALID_STATE, sched_tss, INVALID_STATE }, - { sched_tss, INVALID_STATE, thread_tss }, - }, - .initial_state = thread_tss, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index e464b9294865..87a26195792b 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_WIP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE select DA_MON_EVENTS_IMPLICIT bool "wip monitor" help diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index ed758fec8608..4b4e99615a11 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip, NULL); - return 0; + return rv_register_monitor(&rv_wip, NULL); } static void __exit unregister_wip(void) diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 172f31c4b0f3..4145bea2729e 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr, NULL); - return 0; + return rv_register_monitor(&rv_wwnr, NULL); } static void __exit unregister_wwnr(void) diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 0186ff4cbd0b..76537b8a4343 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,9 +13,9 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_panic_reaction(char *msg) +__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args) { - panic(msg); + vpanic(msg, args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 178759dbf89f..48c934e315b3 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,9 +12,9 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_printk_reaction(char *msg) +__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args) { - printk_deferred(msg); + vprintk_deferred(msg, args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e4077500a91d..ee4e68102f17 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -143,7 +143,7 @@ #include <linux/init.h> #include <linux/slab.h> -#ifdef CONFIG_DA_MON_EVENTS +#ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS #include <rv_trace.h> #endif @@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void) LIST_HEAD(rv_monitors_list); static int task_monitor_count; -static bool task_monitor_slots[RV_PER_TASK_MONITORS]; +static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS]; int 
rv_get_task_monitor_slot(void) { @@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void) lockdep_assert_held(&rv_interface_lock); - if (task_monitor_count == RV_PER_TASK_MONITORS) + if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS) return -EBUSY; task_monitor_count++; - for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) { if (task_monitor_slots[i] == false) { task_monitor_slots[i] = true; return i; @@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot) { lockdep_assert_held(&rv_interface_lock); - if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) { WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); return; } @@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot) * Monitors with a parent are nested, * Monitors without a parent could be standalone or containers. */ -bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +bool rv_is_nested_monitor(struct rv_monitor *mon) { - return mdef->parent != NULL; + return mon->parent != NULL; } /* @@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) * for enable()/disable(). Use this condition to find empty containers. * Keep both conditions in case we have some non-compliant containers. */ -bool rv_is_container_monitor(struct rv_monitor_def *mdef) +bool rv_is_container_monitor(struct rv_monitor *mon) { - struct rv_monitor_def *next; + struct rv_monitor *next; - if (list_is_last(&mdef->list, &rv_monitors_list)) + if (list_is_last(&mon->list, &rv_monitors_list)) return false; - next = list_next_entry(mdef, list); + next = list_next_entry(mon, list); - return next->parent == mdef->monitor || !mdef->monitor->enable; + return next->parent == mon || !mon->enable; } /* @@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; const char *buff; - buff = mdef->monitor->enabled ? "1\n" : "0\n"; + buff = mon->enabled ? "1\n" : "0\n"; return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); } @@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf /* * __rv_disable_monitor - disabled an enabled monitor */ -static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +static int __rv_disable_monitor(struct rv_monitor *mon, bool sync) { lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) { - mdef->monitor->enabled = 0; - if (mdef->monitor->disable) - mdef->monitor->disable(); + if (mon->enabled) { + mon->enabled = 0; + if (mon->disable) + mon->disable(); /* * Wait for the execution of all events to finish. 
@@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } -static void rv_disable_single(struct rv_monitor_def *mdef) +static void rv_disable_single(struct rv_monitor *mon) { - __rv_disable_monitor(mdef, true); + __rv_disable_monitor(mon, true); } -static int rv_enable_single(struct rv_monitor_def *mdef) +static int rv_enable_single(struct rv_monitor *mon) { int retval; lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) + if (mon->enabled) return 0; - retval = mdef->monitor->enable(); + retval = mon->enable(); if (!retval) - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } -static void rv_disable_container(struct rv_monitor_def *mdef) +static void rv_disable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int enabled = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; enabled += __rv_disable_monitor(p, false); } if (enabled) tracepoint_synchronize_unregister(); - mdef->monitor->enabled = 0; + mon->enabled = 0; } -static int rv_enable_container(struct rv_monitor_def *mdef) +static int rv_enable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int retval = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (retval || p->parent != mdef->monitor) + if (retval || p->parent != mon) break; retval = rv_enable_single(p); } if (retval) - rv_disable_container(mdef); + rv_disable_container(mon); else - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } /** * rv_disable_monitor - disable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success. */ -int rv_disable_monitor(struct rv_monitor_def *mdef) +int rv_disable_monitor(struct rv_monitor *mon) { - if (rv_is_container_monitor(mdef)) - rv_disable_container(mdef); + if (rv_is_container_monitor(mon)) + rv_disable_container(mon); else - rv_disable_single(mdef); + rv_disable_single(mon); return 0; } /** * rv_enable_monitor - enable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. */ -int rv_enable_monitor(struct rv_monitor_def *mdef) +int rv_enable_monitor(struct rv_monitor *mon) { int retval; - if (rv_is_container_monitor(mdef)) - retval = rv_enable_container(mdef); + if (rv_is_container_monitor(mon)) + retval = rv_enable_container(mon); else - retval = rv_enable_single(mdef); + retval = rv_enable_single(mon); return retval; } @@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; int retval; bool val; @@ -375,14 +375,12 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); - - mutex_unlock(&rv_interface_lock); + retval = rv_disable_monitor(mon); return retval ? 
: count; } @@ -399,12 +397,12 @@ static const struct file_operations interface_enable_fops = { static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; char buff[256]; memset(buff, 0, sizeof(buff)); - snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + snprintf(buff, sizeof(buff), "%s\n", mon->description); return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); } @@ -419,38 +417,30 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) +static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { struct dentry *root = parent ? parent->root_d : get_monitors_root(); - const char *name = mdef->monitor->name; + struct dentry *dir __free(rv_remove) = rv_create_dir(mon->name, root); struct dentry *tmp; int retval; - mdef->root_d = rv_create_dir(name, root); - if (!mdef->root_d) + if (!dir) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("enable", RV_MODE_WRITE, dir, mon, &interface_enable_fops); + if (!tmp) + return -ENOMEM; - tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("desc", RV_MODE_READ, dir, mon, &interface_desc_fops); + if (!tmp) + return -ENOMEM; - retval = reactor_populate_monitor(mdef); + retval = reactor_populate_monitor(mon, dir); if (retval) - goto out_remove_root; + return retval; + mon->root_d = no_free_ptr(dir); return 0; - -out_remove_root: - rv_remove(mdef->root_d); - return retval; } /* @@ -458,13 +448,12 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mon_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); - if (mon_def->parent) - seq_printf(m, "%s:%s\n", mon_def->parent->name, - mon_def->monitor->name); + if (mon->parent) + seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); else - seq_printf(m, "%s\n", mon_def->monitor->name); + seq_printf(m, "%s\n", mon->name); return 0; } @@ -496,13 +485,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor_def *m_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; - list_for_each_entry_continue(m_def, &rv_monitors_list, list) { - if (m_def->monitor->enabled) - return m_def; + list_for_each_entry_continue(mon, &rv_monitors_list, list) { + if (mon->enabled) + return &mon->list; } return NULL; @@ -510,7 +499,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor_def *m_def; + struct list_head *head; loff_t l; mutex_lock(&rv_interface_lock); @@ -518,15 +507,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + head = &rv_monitors_list; for (l = 0; l <= *pos; ) { - m_def = 
enabled_monitors_next(m, m_def, &l); - if (!m_def) + head = enabled_monitors_next(m, head, &l); + if (!head) break; } - return m_def; + return head; } /* @@ -566,13 +555,13 @@ static const struct file_operations available_monitors_ops = { */ static void disable_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int enabled = 0; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); - list_for_each_entry(mdef, &rv_monitors_list, list) - enabled += __rv_disable_monitor(mdef, false); + list_for_each_entry(mon, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mon, false); if (enabled) { /* @@ -582,8 +571,6 @@ static void disable_all_monitors(void) */ tracepoint_synchronize_unregister(); } - - mutex_unlock(&rv_interface_lock); } static int enabled_monitors_open(struct inode *inode, struct file *file) @@ -598,7 +585,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user size_t count, loff_t *ppos) { char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int retval = -EINVAL; bool enable = true; char *ptr, *tmp; @@ -624,7 +611,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (!len) return count; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); retval = -EINVAL; @@ -633,25 +620,23 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (tmp) ptr = tmp+1; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (strcmp(ptr, mdef->monitor->name) != 0) + list_for_each_entry(mon, &rv_monitors_list, list) { + if (strcmp(ptr, mon->name) != 0) continue; /* * Monitor found! */ if (enable) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); - - if (!retval) - retval = count; + retval = rv_disable_monitor(mon); - break; + if (retval) + return retval; + return count; } - mutex_unlock(&rv_interface_lock); return retval; } @@ -675,8 +660,6 @@ static bool __read_mostly monitoring_on; */ bool rv_monitoring_on(void) { - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_rmb(); return READ_ONCE(monitoring_on); } @@ -696,25 +679,21 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, static void turn_monitoring_off(void) { WRITE_ONCE(monitoring_on, false); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void reset_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled && mdef->monitor->reset) - mdef->monitor->reset(); + list_for_each_entry(mon, &rv_monitors_list, list) { + if (mon->enabled && mon->reset) + mon->reset(); } } static void turn_monitoring_on(void) { WRITE_ONCE(monitoring_on, true); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void turn_monitoring_on_with_reset(void) @@ -744,7 +723,7 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_monitoring_on_with_reset(); @@ -757,8 +736,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -768,10 +745,9 @@ static const struct file_operations monitoring_on_fops = { 
.read = monitoring_on_read_data, }; -static void destroy_monitor_dir(struct rv_monitor_def *mdef) +static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mdef); - rv_remove(mdef->root_d); + rv_remove(mon->root_d); } /** @@ -783,7 +759,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) */ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r, *p = NULL; + struct rv_monitor *r; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { @@ -792,56 +768,34 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) return -EINVAL; } - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(monitor->name, r->monitor->name) == 0) { + if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -EEXIST; - goto out_unlock; - } - } - - if (parent) { - list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(parent->name, r->monitor->name) == 0) { - p = r; - break; - } + return -EEXIST; } } - if (p && rv_is_nested_monitor(p)) { + if (parent && rv_is_nested_monitor(parent)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); - retval = -EINVAL; - goto out_unlock; - } - - r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); - if (!r) { - retval = -ENOMEM; - goto out_unlock; + return -EINVAL; } - r->monitor = monitor; - r->parent = parent; + monitor->parent = parent; - retval = create_monitor_dir(r, p); - if (retval) { - kfree(r); - goto out_unlock; - } + retval = create_monitor_dir(monitor, parent); + if (retval) + return retval; /* keep children close to the parent for easier visualisation */ - if (p) - list_add(&r->list, &p->list); + if (parent) + list_add(&monitor->list, &parent->list); else - list_add_tail(&r->list, &rv_monitors_list); + list_add_tail(&monitor->list, &rv_monitors_list); -out_unlock: - mutex_unlock(&rv_interface_lock); - return retval; + return 0; } /** @@ -852,19 +806,12 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - struct rv_monitor_def *ptr, *next; - - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); - list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { - if (strcmp(monitor->name, ptr->monitor->name) == 0) { - rv_disable_monitor(ptr); - list_del(&ptr->list); - destroy_monitor_dir(ptr); - } - } + rv_disable_monitor(monitor); + list_del(&monitor->list); + destroy_monitor_dir(monitor); - mutex_unlock(&rv_interface_lock); return 0; } @@ -872,39 +819,36 @@ int __init rv_init_interface(void) { struct dentry *tmp; int retval; + struct dentry *root_dir __free(rv_remove) = rv_create_dir("rv", NULL); - rv_root.root_dir = rv_create_dir("rv", NULL); - if (!rv_root.root_dir) - goto out_err; + if (!root_dir) + return 1; - rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + rv_root.monitors_dir = rv_create_dir("monitors", root_dir); if (!rv_root.monitors_dir) - goto out_err; + return 1; - tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + tmp = rv_create_file("available_monitors", RV_MODE_READ, root_dir, NULL, &available_monitors_ops); if (!tmp) - goto out_err; + return 1; - tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, root_dir, NULL, &enabled_monitors_ops); if (!tmp) - goto out_err; + return 
1; - tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, root_dir, NULL, &monitoring_on_fops); if (!tmp) - goto out_err; - retval = init_rv_reactors(rv_root.root_dir); + return 1; + retval = init_rv_reactors(root_dir); if (retval) - goto out_err; + return 1; turn_monitoring_on(); - return 0; + rv_root.root_dir = no_free_ptr(root_dir); -out_err: - rv_remove(rv_root.root_dir); - printk(KERN_ERR "RV: Error while creating the RV interface\n"); - return 1; + return 0; } diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 98fca0a1adbc..2c0f51ff9d5c 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -17,54 +17,29 @@ struct rv_interface { #define rv_create_file tracefs_create_file #define rv_remove tracefs_remove +DEFINE_FREE(rv_remove, struct dentry *, if (_T) rv_remove(_T)); + #define MAX_RV_MONITOR_NAME_SIZE 32 #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; extern struct list_head rv_monitors_list; -#ifdef CONFIG_RV_REACTORS -struct rv_reactor_def { - struct list_head list; - struct rv_reactor *reactor; - /* protected by the monitor interface lock */ - int counter; -}; -#endif - -struct rv_monitor_def { - struct list_head list; - struct rv_monitor *monitor; - struct rv_monitor *parent; - struct dentry *root_d; -#ifdef CONFIG_RV_REACTORS - struct rv_reactor_def *rdef; - bool reacting; -#endif - bool task_monitor; -}; - struct dentry *get_monitors_root(void); -int rv_disable_monitor(struct rv_monitor_def *mdef); -int rv_enable_monitor(struct rv_monitor_def *mdef); -bool rv_is_container_monitor(struct rv_monitor_def *mdef); -bool rv_is_nested_monitor(struct rv_monitor_def *mdef); +int rv_disable_monitor(struct rv_monitor *mon); +int rv_enable_monitor(struct rv_monitor *mon); +bool rv_is_container_monitor(struct rv_monitor *mon); +bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor_def *mdef); -void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +static inline int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - return; -} - static inline int init_rv_reactors(struct dentry *root_dir) { return 0; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 9501ca886d83..460af07f7aba 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -61,6 +61,7 @@ * printk */ +#include <linux/lockdep.h> #include <linux/slab.h> #include "rv.h" @@ -70,12 +71,12 @@ */ static LIST_HEAD(rv_reactors_list); -static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +static struct rv_reactor *get_reactor_rdef_by_name(char *name) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(name, r->reactor->name) == 0) + if (strcmp(name, r->name) == 0) return r; } return NULL; @@ -86,9 +87,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor_def *rea_def = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - seq_printf(m, "%s\n", rea_def->reactor->name); + seq_printf(m, 
"%s\n", reactor->name); return 0; } @@ -138,13 +139,13 @@ static const struct file_operations available_reactors_ops = { */ static int monitor_reactor_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mdef = m->private; - struct rv_reactor_def *rdef = p; + struct rv_monitor *mon = m->private; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - if (mdef->rdef == rdef) - seq_printf(m, "[%s]\n", rdef->reactor->name); + if (mon->reactor == reactor) + seq_printf(m, "[%s]\n", reactor->name); else - seq_printf(m, "%s\n", rdef->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -158,43 +159,37 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, - bool reacting, bool nested) +static void monitor_swap_reactors_single(struct rv_monitor *mon, + struct rv_reactor *reactor, + bool nested) { bool monitor_enabled; /* nothing to do */ - if (mdef->rdef == rdef) + if (mon->reactor == reactor) return; - monitor_enabled = mdef->monitor->enabled; + monitor_enabled = mon->enabled; if (monitor_enabled) - rv_disable_monitor(mdef); + rv_disable_monitor(mon); - /* swap reactor's usage */ - mdef->rdef->counter--; - rdef->counter++; - - mdef->rdef = rdef; - mdef->reacting = reacting; - mdef->monitor->react = rdef->reactor->react; + mon->reactor = reactor; + mon->react = reactor->react; /* enable only once if iterating through a container */ if (monitor_enabled && !nested) - rv_enable_monitor(mdef); + rv_enable_monitor(mon); } -static void monitor_swap_reactors(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, bool reacting) +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; - if (rv_is_container_monitor(mdef)) + if (rv_is_container_monitor(mon)) list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; - monitor_swap_reactors_single(p, rdef, reacting, true); + monitor_swap_reactors_single(p, reactor, true); } /* * This call enables and disables the monitor if they were active. @@ -202,7 +197,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, * All nested monitors are enabled also if they were off, we may refine * this logic in the future. 
*/ - monitor_swap_reactors_single(mdef, rdef, reacting, false); + monitor_swap_reactors_single(mon, reactor, false); } static ssize_t @@ -210,11 +205,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; - struct rv_reactor_def *rdef; + struct rv_monitor *mon; + struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; - bool enable; char *ptr; int len; @@ -237,30 +231,20 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, * See monitor_reactors_open() */ seq_f = file->private_data; - mdef = seq_f->private; - - mutex_lock(&rv_interface_lock); + mon = seq_f->private; - retval = -EINVAL; + guard(mutex)(&rv_interface_lock); - list_for_each_entry(rdef, &rv_reactors_list, list) { - if (strcmp(ptr, rdef->reactor->name) != 0) + list_for_each_entry(reactor, &rv_reactors_list, list) { + if (strcmp(ptr, reactor->name) != 0) continue; - if (rdef == get_reactor_rdef_by_name("nop")) - enable = false; - else - enable = true; + monitor_swap_reactors(mon, reactor); - monitor_swap_reactors(mdef, rdef, enable); - - retval = count; - break; + return count; } - mutex_unlock(&rv_interface_lock); - - return retval; + return -EINVAL; } /* @@ -268,7 +252,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, */ static int monitor_reactors_open(struct inode *inode, struct file *file) { - struct rv_monitor_def *mdef = inode->i_private; + struct rv_monitor *mon = inode->i_private; struct seq_file *seq_f; int ret; @@ -284,7 +268,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file) /* * Copy the create file "private" data to the seq_file private data. */ - seq_f->private = mdef; + seq_f->private = mon; return 0; }; @@ -299,23 +283,16 @@ static const struct file_operations monitor_reactors_ops = { static int __rv_register_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(reactor->name, r->reactor->name) == 0) { + if (strcmp(reactor->name, r->name) == 0) { pr_info("Reactor %s is already registered\n", reactor->name); return -EINVAL; } } - r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->reactor = reactor; - r->counter = 0; - - list_add_tail(&r->list, &rv_reactors_list); + list_add_tail(&reactor->list, &rv_reactors_list); return 0; } @@ -328,18 +305,14 @@ static int __rv_register_reactor(struct rv_reactor *reactor) */ int rv_register_reactor(struct rv_reactor *reactor) { - int retval = 0; - if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { pr_info("Reactor %s has a name longer than %d\n", reactor->name, MAX_RV_MONITOR_NAME_SIZE); return -EINVAL; } - mutex_lock(&rv_interface_lock); - retval = __rv_register_reactor(reactor); - mutex_unlock(&rv_interface_lock); - return retval; + guard(mutex)(&rv_interface_lock); + return __rv_register_reactor(reactor); } /** @@ -350,30 +323,9 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *ptr, *next; - int ret = 0; - - mutex_lock(&rv_interface_lock); - - list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { - if (strcmp(reactor->name, ptr->reactor->name) == 0) { - - if (!ptr->counter) { - list_del(&ptr->list); - } else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - ptr->reactor->name, ptr->counter); - 
printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - ptr->reactor->name); - ret = -EBUSY; - break; - } - } - } - - mutex_unlock(&rv_interface_lock); - return ret; + guard(mutex)(&rv_interface_lock); + list_del(&reactor->list); + return 0; } /* @@ -386,7 +338,7 @@ static bool __read_mostly reacting_on; * * Returns 1 if on, 0 otherwise. */ -bool rv_reacting_on(void) +static bool rv_reacting_on(void) { /* Ensures that concurrent monitors read consistent reacting_on */ smp_rmb(); @@ -428,7 +380,7 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_reacting_on(); @@ -441,8 +393,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -454,43 +404,31 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file - * @mdef: monitor's definition. + * @mon: The monitor. + * @root: The directory of the monitor. * * Returns 0 if successful, error otherwise. */ -int reactor_populate_monitor(struct rv_monitor_def *mdef) +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, root, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; /* * Configure as the rv_nop reactor. */ - mdef->rdef = get_reactor_rdef_by_name("nop"); - mdef->rdef->counter++; - mdef->reacting = false; + mon->reactor = get_reactor_rdef_by_name("nop"); return 0; } -/** - * reactor_cleanup_monitor - cleanup a monitor reference - * @mdef: monitor's definition. - */ -void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - lockdep_assert_held(&rv_interface_lock); - mdef->rdef->counter--; - WARN_ON_ONCE(mdef->rdef->counter < 0); -} - /* * Nop reactor register */ -static void rv_nop_reaction(char *msg) +__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args) { } @@ -502,30 +440,42 @@ static struct rv_reactor rv_nop = { int init_rv_reactors(struct dentry *root_dir) { - struct dentry *available, *reacting; int retval; - available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, - &available_reactors_ops); - if (!available) - goto out_err; + struct dentry *available __free(rv_remove) = + rv_create_file("available_reactors", RV_MODE_READ, root_dir, + NULL, &available_reactors_ops); - reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); - if (!reacting) - goto rm_available; + struct dentry *reacting __free(rv_remove) = + rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + + if (!reacting || !available) + return -ENOMEM; retval = __rv_register_reactor(&rv_nop); if (retval) - goto rm_reacting; + return retval; turn_reacting_on(); + retain_and_null_ptr(available); + retain_and_null_ptr(reacting); return 0; +} + +void rv_react(struct rv_monitor *monitor, const char *msg, ...) 
+{ + static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE); + va_list args; + + if (!rv_reacting_on() || !monitor->react) + return; + + va_start(args, msg); + + lock_map_acquire_try(&rv_react_map); + monitor->react(msg, args); + lock_map_release(&rv_react_map); -rm_reacting: - rv_remove(reacting); -rm_available: - rv_remove(available); -out_err: - return -ENOMEM; + va_end(args); } diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 422b75f58891..4a6faddac614 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -16,24 +16,24 @@ DECLARE_EVENT_CLASS(event_da_monitor, TP_ARGS(state, event, next_state, final_state), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->final_state = final_state; ), - TP_printk("%s x %s -> %s %s", - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") + TP_printk("%s x %s -> %s%s", + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor, @@ -43,26 +43,26 @@ DECLARE_EVENT_CLASS(error_da_monitor, TP_ARGS(state, event), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __assign_str(state); + __assign_str(event); ), TP_printk("event %s not expected in the state %s", - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wip/wip_trace.h> -#include <monitors/tss/tss_trace.h> #include <monitors/sco/sco_trace.h> #include <monitors/scpd/scpd_trace.h> #include <monitors/snep/snep_trace.h> -#include <monitors/sncid/sncid_trace.h> +#include <monitors/sts/sts_trace.h> +#include <monitors/opid/opid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -75,27 +75,27 @@ DECLARE_EVENT_CLASS(event_da_monitor_id, TP_ARGS(id, state, event, next_state, final_state), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->id = id; - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->id = id; + __entry->final_state = final_state; ), - TP_printk("%d: %s x %s -> %s %s", + TP_printk("%d: %s x %s -> %s%s", __entry->id, - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state 
? "(final)" : "") + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor_id, @@ -105,32 +105,108 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, TP_ARGS(id, state, event), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __field( int, id ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - __entry->id = id; + __assign_str(state); + __assign_str(event); + __entry->id = id; ), TP_printk("%d: event %s not expected in the state %s", __entry->id, - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wwnr/wwnr_trace.h> #include <monitors/snroc/snroc_trace.h> +#include <monitors/nrp/nrp_trace.h> +#include <monitors/sssw/sssw_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ +#ifdef CONFIG_LTL_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_ltl_monitor_id, + + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + + TP_ARGS(task, states, atoms, next), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + __string(states, states) + __string(atoms, atoms) + __string(next, next) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + __assign_str(states); + __assign_str(atoms); + __assign_str(next); + ), + + TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid, + __get_str(states), __get_str(atoms), __get_str(next)) +); + +DECLARE_EVENT_CLASS(error_ltl_monitor_id, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + ), + + TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) +); +#include <monitors/pagefault/pagefault_trace.h> +#include <monitors/sleep/sleep_trace.h> +// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here +#endif /* CONFIG_LTL_MON_EVENTS_ID */ + +#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS +/* Tracepoint useful for monitors development, currenly only used in DA */ +TRACE_EVENT(rv_retries_error, + + TP_PROTO(char *name, char *event), + + TP_ARGS(name, event), + + TP_STRUCT__entry( + __string( name, name ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(name); + __assign_str(event); + ), + + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) + " retries reached for event %s, resetting monitor %s", + __get_str(event), __get_str(name)) +); +#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ #endif /* _TRACE_RV_H */ -/* This part ust be outside protection */ +/* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE rv_trace #include <trace/define_trace.h> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b8db27fb6ef..6f2148df14d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include <linux/security.h> #include <linux/seq_file.h> #include <linux/irqflags.h> +#include <linux/syscalls.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> @@ -51,6 +52,7 @@ #include <linux/workqueue.h> #include <linux/sort.h> #include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -92,17 +94,13 @@ static bool tracepoint_printk_stop_on_boot __initdata; static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } +/* Store tracers and their flags per instance */ +struct tracers { + struct list_head list; + struct tracer *tracer; + struct tracer_flags *flags; }; -static int -dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return 0; -} - /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event @@ -120,13 +118,14 @@ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a + * capturing traces that lead to crashes and outputting it to a * serial console. * * It is default off, but you can enable it with either specifying @@ -135,14 +134,47 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops * Set instance name if you want to dump the specific trace instance - * Multiple instance dump is also supported, and instances are seperated + * Multiple instance dump is also supported, and instances are separated * by commas. 
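Editor's note: with dummy_tracer_opt and dummy_set_flag removed, tracer option state now lives in per-instance struct tracers entries hung off the trace_array, so one instance's flags no longer leak into another. A sketch of the lookup this enables; the helper name is invented, and tracing_set_tracer() later in this file open-codes the same walk:

/* Assumed helper, for illustration only: find the per-instance entry
 * for a tracer by name. Caller must hold trace_types_lock. */
static struct tracers *find_tracer_entry(struct trace_array *tr, const char *name)
{
	struct tracers *t;

	lockdep_assert_held(&trace_types_lock);

	list_for_each_entry(t, &tr->tracers, list) {
		if (strcmp(t->tracer->name, name) == 0)
			return t;
	}
	return NULL;
}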
*/ /* Set to string format zero to disable by default */ char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ @@ -397,15 +429,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; - preempt_disable_notrace(); + guard(preempt_notrace)(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } - - preempt_enable_notrace(); } static inline void @@ -462,46 +492,40 @@ int register_ftrace_export(struct trace_export *export) if (WARN_ON_ONCE(!export->write)) return -1; - mutex_lock(&ftrace_export_lock); + guard(mutex)(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); - mutex_unlock(&ftrace_export_lock); - return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); } EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ - (FUNCTION_DEFAULT_FLAGS | \ - TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ - TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) + (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \ + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ + TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* trace_options that are only supported by global_trace */ -#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ - TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ + TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ + TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) + (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ + 
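Editor's note: many conversions in this file replace explicit lock/unlock pairs and goto-based unwinding with the scope-based guards from <linux/cleanup.h>, as in register_ftrace_export() above. A self-contained illustration of the two forms used throughout; the counter and lock here are examples, not from this patch:

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);
static int example_counter;

static int example_inc(void)
{
	guard(mutex)(&example_lock);	/* released automatically on return */
	return ++example_counter;
}

static void example_reset(void)
{
	scoped_guard(mutex, &example_lock)	/* held only for this block */
		example_counter = 0;
}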
TRACE_ITER(COPY_MARKER)) /* * The global_trace is the descriptor that holds the top-level tracing @@ -513,6 +537,9 @@ static struct trace_array global_trace = { static struct trace_array *printk_trace = &global_trace; +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); + static __always_inline bool printk_binsafe(struct trace_array *tr) { /* @@ -529,9 +556,31 @@ static void update_printk_trace(struct trace_array *tr) if (printk_trace == tr) return; - printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK); printk_trace = tr; - tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; + tr->trace_flags |= TRACE_ITER(TRACE_PRINTK); +} + +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (!list_empty(&tr->marker_list)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER(COPY_MARKER); + return true; + } + + if (list_empty(&tr->marker_list)) + return false; + + list_del_init(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); + return true; } void trace_set_ring_buffer_expanded(struct trace_array *tr) @@ -578,9 +627,8 @@ void trace_array_put(struct trace_array *this_tr) if (!this_tr) return; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); @@ -784,7 +832,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -821,6 +872,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { @@ -874,7 +926,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -1045,8 +1096,6 @@ void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. 
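Editor's note: update_marker_trace() above is what lets several instances receive writes to the top-level trace_marker; subscribed instances sit on the RCU-protected marker_copies list and carry TRACE_ITER(COPY_MARKER). A sketch of toggling that state, mirroring what set_tracer_flag() does for this flag (the wrapper name is invented; set_tracer_flag() is the real entry point):

/* Illustrative wrapper only. Returns true if the subscription changed. */
static bool example_toggle_marker_copy(struct trace_array *tr, bool enable)
{
	guard(mutex)(&event_mutex);	/* update_marker_trace() asserts this */

	return update_marker_trace(tr, enable);
}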
*/ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1088,7 +1137,7 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx; int alloc; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) @@ -1101,13 +1150,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); - if (!event) { - size = 0; - goto out; - } + if (!event) + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1123,8 +1170,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); @@ -1154,12 +1199,11 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1168,11 +1212,11 @@ int __trace_bputs(unsigned long ip, const char *str) trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) - goto out; + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1181,10 +1225,7 @@ int __trace_bputs(unsigned long ip, const char *str) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; + return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -1373,13 +1414,8 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) int tracing_arm_snapshot(struct trace_array *tr) { - int ret; - - mutex_lock(&trace_types_lock); - ret = tracing_arm_snapshot_locked(tr); - mutex_unlock(&trace_types_lock); - - return ret; + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); } void tracing_disarm_snapshot(struct trace_array *tr) @@ -1578,8 +1614,39 @@ void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); +} + +/** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. 
+ */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. + */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); } /** @@ -1751,7 +1818,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; @@ -1765,7 +1832,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1775,8 +1842,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1786,11 +1852,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; - goto out; + goto fail; } + ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1806,13 +1873,13 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; - goto out; + goto fail; } *ppos += read; - ret = read; - -out: + return read; +fail: + trace_parser_fail(parser); return ret; } @@ -2104,6 +2171,7 @@ static int save_selftest(struct tracer *type) static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; + struct tracer_flags *saved_flags = tr->current_trace_flags; struct tracer *saved_tracer = tr->current_trace; int ret; @@ -2134,6 +2202,7 @@ static int run_tracer_selftest(struct tracer *type) tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; + tr->current_trace_flags = type->flags ? : type->default_flags; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { @@ -2150,6 +2219,7 @@ static int run_tracer_selftest(struct tracer *type) ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; + tr->current_trace_flags = saved_flags; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ @@ -2242,10 +2312,23 @@ static inline int do_run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ -static void add_tracer_options(struct trace_array *tr, struct tracer *t); +static int add_tracer(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); +static void free_tracers(struct trace_array *tr) +{ + struct tracers *t, *n; + + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry_safe(t, n, &tr->tracers, list) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } +} + /** * register_tracer - register a tracer with the ftrace system. 
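Editor's note: unlike tracer_tracing_off()/on(), the new tracer_tracing_disable()/enable() pair maps onto the ring buffer's counting record-disable, so independent users compose. A usage sketch:

/* Sketch: pause writes to an instance around an inspection window.
 * Nested users each do their own disable/enable; writing resumes only
 * once every disable has been paired with an enable. */
static void example_inspect_buffer(struct trace_array *tr)
{
	tracer_tracing_disable(tr);

	/* ... read or resize the buffers while no new events arrive ... */

	tracer_tracing_enable(tr);
}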
* @type: the plugin for the tracer @@ -2254,6 +2337,7 @@ static void __init apply_trace_boot_options(void); */ int __init register_tracer(struct tracer *type) { + struct trace_array *tr; struct tracer *t; int ret = 0; @@ -2285,44 +2369,38 @@ int __init register_tracer(struct tracer *type) } } - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) { - /*allocate a dummy tracer_flags*/ - type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) { - ret = -ENOMEM; - goto out; - } - type->flags->val = 0; - type->flags->opts = dummy_tracer_opt; - } else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - /* store the tracer for __set_tracer_option */ - type->flags->trace = type; + if (type->flags) + type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + ret = add_tracer(tr, type); + if (ret < 0) { + /* The tracer will still exist but without options */ + pr_warn("Failed to create tracer options for %s\n", type->name); + break; + } + } + type->next = trace_types; trace_types = type; - add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(&global_trace, type->name); + WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0); default_bootup_tracer = NULL; apply_trace_boot_options(); @@ -2330,8 +2408,7 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. 
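Editor's note: register_tracer() now walks every existing instance and calls add_tracer() so each trace_array gets its own options entry up front. For reference, a minimal tracer registration looks roughly like this; the "noplike" tracer is hypothetical, and real tracers fill in more callbacks as needed:

static int noplike_init(struct trace_array *tr)
{
	return 0;
}

static void noplike_reset(struct trace_array *tr)
{
}

static struct tracer noplike_tracer __read_mostly = {
	.name	= "noplike",
	.init	= noplike_init,
	.reset	= noplike_reset,
};

static int __init init_noplike_tracer(void)
{
	return register_tracer(&noplike_tracer);
}
core_initcall(init_noplike_tracer);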
*/ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) @@ -2408,9 +2485,8 @@ void tracing_reset_all_online_cpus_unlocked(void) void tracing_reset_all_online_cpus(void) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); - mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) @@ -2421,18 +2497,17 @@ int is_tracing_stopped(void) static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; if (tracing_disabled) return; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ @@ -2449,9 +2524,6 @@ static void tracing_start_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2469,11 +2541,10 @@ void tracing_start(void) static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) - goto out; + return; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); @@ -2489,9 +2560,6 @@ static void tracing_stop_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2604,19 +2672,17 @@ void trace_buffered_event_enable(void) per_cpu(trace_buffered_event, cpu) = event; - preempt_disable(); - if (cpu == smp_processor_id() && - __this_cpu_read(trace_buffered_event) != - per_cpu(trace_buffered_event, cpu)) - WARN_ON_ONCE(1); - preempt_enable(); + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } } } static void enable_trace_buffered_event(void *data) { - /* Probably not needed, but do it anyway */ - smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } @@ -2956,7 +3022,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, skip++; #endif - preempt_disable_notrace(); + guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3014,8 +3080,6 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - } static inline void ftrace_trace_stack(struct trace_array *tr, @@ -3023,7 +3087,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) return; __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); @@ -3084,7 +3148,7 @@ ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE))) return; /* @@ -3098,9 +3162,9 @@ ftrace_trace_userstack(struct trace_array *tr, * prevent recursion, since the user stack tracing may * trigger other kernel events. 
*/ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); @@ -3118,8 +3182,6 @@ ftrace_trace_userstack(struct trace_array *tr, out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, @@ -3301,7 +3363,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { @@ -3316,26 +3378,23 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; -out: - ring_buffer_nest_end(buffer); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } out_put: put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -3359,7 +3418,7 @@ int __trace_array_vprintk(struct trace_buffer *buffer, pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); @@ -3371,24 +3430,22 @@ int __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } out: - ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -3436,7 +3493,7 @@ int trace_array_printk(struct trace_array *tr, if (tr == &global_trace) return 0; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3473,7 +3530,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, int ret; va_list ap; - if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3743,7 +3800,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (!iter->tr || iter->tr->trace_flags & 
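Editor's note: the ring_buffer_nest conversions rely on a guard type that pairs ring_buffer_nest_start() with ring_buffer_nest_end() at scope exit. Its declaration is not part of this hunk; it presumably looks roughly like the following <linux/cleanup.h> definition (location and exact form are assumptions):

/* Assumed guard declaration, shown for context only: */
DEFINE_GUARD(ring_buffer_nest, struct trace_buffer *,
	     ring_buffer_nest_start(_T), ring_buffer_nest_end(_T))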
TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR)) return fmt; p = fmt; @@ -4065,7 +4122,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m) static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); print_event_info(buf, m); @@ -4076,7 +4133,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); static const char space[] = " "; int prec = tgid ? 12 : 2; @@ -4149,7 +4206,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER(ANNOTATE))) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) @@ -4171,6 +4228,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter) iter->cpu); } +#ifdef CONFIG_FTRACE_SYSCALLS +static bool is_syscall_event(struct trace_event *event) +{ + return (event->funcs == &enter_syscall_print_funcs) || + (event->funcs == &exit_syscall_print_funcs); + +} +#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +#else +static inline bool is_syscall_event(struct trace_event *event) +{ + return false; +} +#define syscall_buf_size 0 +#endif /* CONFIG_FTRACE_SYSCALLS */ + static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; @@ -4185,7 +4258,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -4196,17 +4269,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return TRACE_TYPE_PARTIAL_LINE; if (event) { - if (tr->trace_flags & TRACE_ITER_FIELDS) + if (tr->trace_flags & TRACE_ITER(FIELDS)) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not * safe to use if the array has delta offsets * Force printing via the fields. 
*/ - if ((tr->text_delta) && - event->type > __TRACE_LAST_TYPE) + if ((tr->text_delta)) { + /* ftrace and system call events are still OK */ + if ((event->type > __TRACE_LAST_TYPE) && + !is_syscall_event(event)) return print_event_fields(iter, event); - + } return event->funcs->trace(iter, sym_flags, event); } @@ -4224,7 +4299,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -4250,7 +4325,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -4279,7 +4354,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -4350,27 +4425,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) } if (iter->ent->type == TRACE_BPUTS && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_printk_msg_only(iter); - if (trace_flags & TRACE_ITER_BIN) + if (trace_flags & TRACE_ITER(BIN)) return print_bin_fmt(iter); - if (trace_flags & TRACE_ITER_HEX) + if (trace_flags & TRACE_ITER(HEX)) return print_hex_fmt(iter); - if (trace_flags & TRACE_ITER_RAW) + if (trace_flags & TRACE_ITER(RAW)) return print_raw_fmt(iter); return print_trace_fmt(iter); @@ -4388,7 +4463,7 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } @@ -4398,7 +4473,7 @@ void trace_default_header(struct seq_file *m) struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(trace_flags & TRACE_ITER(CONTEXT_INFO))) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { @@ -4406,11 +4481,11 @@ void trace_default_header(struct seq_file *m) if (trace_empty(iter)) return; print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (!(trace_flags & TRACE_ITER(VERBOSE))) { + if (trace_flags & TRACE_ITER(IRQ_INFO)) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else @@ -4634,27 +4709,23 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) * If pause-on-trace is enabled, then stop the trace 
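Editor's note: when several output-format flags are set at once, print_trace_line() resolves them in the order bin, then hex, then raw. These bits are flipped through the trace_options file; a small userspace helper for that, assuming the usual tracefs mount point:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Write "hex", "nohex", "raw", ... to the top-level trace_options file. */
static int set_trace_option(const char *opt)
{
	int fd = open("/sys/kernel/tracing/trace_options", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, opt, strlen(opt));
	close(fd);
	return ret < 0 ? -1 : 0;
}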
while * dumping, unless this is the "snapshot" file */ - if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) { + iter->iter_flags |= TRACE_FILE_PAUSE; tracing_stop_tr(tr); + } if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } @@ -4718,20 +4789,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); - ret = -ENODEV; + return -ENODEV; } else { event_file_get(file); } - mutex_unlock(&event_mutex); - if (ret) - return ret; - filp->private_data = inode->i_private; return 0; @@ -4753,12 +4820,6 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp) return single_release(inode, filp); } -static int tracing_mark_open(struct inode *inode, struct file *filp) -{ - stream_open(inode, filp); - return tracing_open_generic_tr(inode, filp); -} - static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -4783,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot && tr->stop_count) + if (iter->iter_flags & TRACE_FILE_PAUSE) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -4844,7 +4905,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -5008,7 +5069,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", @@ -5019,16 +5080,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_err; - } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); - -out_err: - kfree(mask_str); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, @@ -5048,7 +5103,6 @@ int tracing_set_cpumask(struct trace_array *tr, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef 
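Editor's note: the open path above also reflects a ring-buffer API change: the old ring_buffer_read_prepare() / ring_buffer_read_prepare_sync() / ring_buffer_read_start() sequence collapses into a single ring_buffer_read_start() call that hands back a ready iterator. A sketch of a standalone user of the new form (helper name invented):

/* Sketch: dump one CPU's buffer with the single-call iterator setup. */
static void example_walk_cpu(struct trace_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;

	iter = ring_buffer_read_start(buffer, cpu, GFP_KERNEL);
	if (!iter)
		return;

	/* ... walk events with ring_buffer_iter_peek()/ring_buffer_iter_advance() ... */

	ring_buffer_read_finish(iter);
}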
CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); @@ -5056,7 +5110,6 @@ int tracing_set_cpumask(struct trace_array *tr, } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); @@ -5115,21 +5168,26 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer_flags *flags; u32 tracer_flags; int i; guard(mutex)(&trace_types_lock); - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; - for (i = 0; trace_options[i]; i++) { - if (tr->trace_flags & (1 << i)) + if (tr->trace_flags & (1ULL << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } + flags = tr->current_trace_flags; + if (!flags || !flags->opts) + return 0; + + tracer_flags = flags->val; + trace_opts = flags->opts; + for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); @@ -5145,9 +5203,10 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; - int ret; + int ret = 0; - ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); + if (trace->set_flag) + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -5161,36 +5220,41 @@ static int __set_tracer_option(struct trace_array *tr, /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { - struct tracer *trace = tr->current_trace; - struct tracer_flags *tracer_flags = trace->flags; + struct tracer_flags *tracer_flags = tr->current_trace_flags; struct tracer_opt *opts = NULL; int i; + if (!tracer_flags || !tracer_flags->opts) + return 0; + for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(tr, trace->flags, opts, neg); + return __set_tracer_option(tr, tracer_flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set) { - if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set) return -1; return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) { - if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK)) + switch (mask) { + case TRACE_ITER(RECORD_TGID): + case TRACE_ITER(RECORD_CMD): + case TRACE_ITER(TRACE_PRINTK): + case TRACE_ITER(COPY_MARKER): lockdep_assert_held(&event_mutex); + } /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) @@ -5201,7 +5265,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; - if (mask == TRACE_ITER_TRACE_PRINTK) { + switch (mask) { + case TRACE_ITER(TRACE_PRINTK): if (enabled) { update_printk_trace(tr); } else { @@ -5213,11 +5278,17 @@ int 
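Editor's note: tracing_cpumask_read() now leans on the __free(kfree) attribute, which ties the allocation's lifetime to the variable's scope and removes the out_err label. The same pattern in isolation (function name and shape invented):

/* Sketch: scope-managed allocation; str is kfree()d on every return path. */
static int example_show_mask(const struct cpumask *mask, char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	char *str __free(kfree) = kasprintf(GFP_KERNEL, "%*pb\n",
					    cpumask_pr_args(mask));

	if (!str)
		return -ENOMEM;

	return simple_read_from_buffer(ubuf, count, ppos, str, strlen(str));
}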
set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; /* * An instance must always have it set. - * by default, that's the global_trace instane. + * by default, that's the global_trace instance. */ if (printk_trace == tr) update_printk_trace(&global_trace); } + break; + + case TRACE_ITER(COPY_MARKER): + update_marker_trace(tr, enabled); + /* update_marker_trace updates the tr->trace_flags */ + return 0; } if (enabled) @@ -5225,35 +5296,46 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) else tr->trace_flags &= ~mask; - if (mask == TRACE_ITER_RECORD_CMD) + switch (mask) { + case TRACE_ITER(RECORD_CMD): trace_event_enable_cmd_record(enabled); + break; - if (mask == TRACE_ITER_RECORD_TGID) { + case TRACE_ITER(RECORD_TGID): if (trace_alloc_tgid_map() < 0) { - tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + tr->trace_flags &= ~TRACE_ITER(RECORD_TGID); return -ENOMEM; } trace_event_enable_tgid_record(enabled); - } + break; - if (mask == TRACE_ITER_EVENT_FORK) + case TRACE_ITER(EVENT_FORK): trace_event_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_FUNC_FORK) + case TRACE_ITER(FUNC_FORK): ftrace_pid_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_OVERWRITE) { + case TRACE_ITER(OVERWRITE): ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif - } + break; - if (mask == TRACE_ITER_PRINTK) { + case TRACE_ITER(PRINTK): trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); + break; + +#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER) + case TRACE_GRAPH_GRAPH_TIME: + ftrace_graph_graph_time_control(enabled); + break; +#endif } return 0; @@ -5283,7 +5365,7 @@ int trace_set_options(struct trace_array *tr, char *option) if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else - ret = set_tracer_flag(tr, 1 << ret, !neg); + ret = set_tracer_flag(tr, 1ULL << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -5840,17 +5922,27 @@ static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_eval_map(struct module *mod, - struct trace_eval_map **start, int len) +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) { struct trace_eval_map **map; - if (len <= 0) - return; + /* Always run sanitizer only if btf_type_tag attr exists. 
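Editor's note: with the trace flags widened to 64 bits, masks are built with 1ULL shifts and set_tracer_flag() now takes a u64. A sketch of driving it from kernel code with the same lock ordering trace_set_options() uses (the wrapper name is invented):

/* Illustrative only; user space normally goes through trace_options. */
static int example_set_flag(struct trace_array *tr, u64 mask, int enabled)
{
	int ret;

	mutex_lock(&event_mutex);
	mutex_lock(&trace_types_lock);
	ret = set_tracer_flag(tr, mask, enabled);
	mutex_unlock(&trace_types_lock);
	mutex_unlock(&event_mutex);

	return ret;
}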
*/ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } map = start; - trace_event_eval_update(map, len); + trace_event_update_all(map, len); + + if (len <= 0) + return; trace_insert_eval_map_file(mod, start, len); } @@ -5863,9 +5955,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -6004,6 +6096,7 @@ struct trace_mod_entry { }; struct trace_scratch { + unsigned int clock_id; unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; @@ -6032,6 +6125,7 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) struct trace_module_delta *module_delta; struct trace_scratch *tscratch; struct trace_mod_entry *entry; + unsigned long raddr; int idx = 0, nr_entries; /* If we don't have last boot delta, return the address */ @@ -6045,7 +6139,9 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) module_delta = READ_ONCE(tr->module_delta); if (!module_delta || !tscratch->nr_entries || tscratch->entries[0].mod_addr > addr) { - return addr + tr->text_delta; + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; } /* Note that entries must be sorted. */ @@ -6116,6 +6212,7 @@ static void update_last_data(struct trace_array *tr) if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; + tscratch->clock_id = tr->clock_id; memset(tscratch->entries, 0, flex_array_size(tscratch, entries, tscratch->nr_entries)); tscratch->nr_entries = 0; @@ -6162,23 +6259,16 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); - mutex_unlock(&trace_types_lock); - return ret; } -struct trace_option_dentry; - -static void -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); - /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. @@ -6194,26 +6284,15 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; } static bool tracer_options_updated; -static void add_tracer_options(struct trace_array *tr, struct tracer *t) -{ - /* Only enable if the directory has been created already. 
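Editor's note: trace_adjust_address() now refuses a delta-adjusted address that no longer points into the kernel image or its data, falling back to the raw value. The added check, factored out for clarity (helper name invented):

static unsigned long example_adjust_core_text(struct trace_array *tr,
					      unsigned long addr)
{
	unsigned long raddr = addr + tr->text_delta;

	if (__is_kernel(raddr) || is_kernel_core_data(raddr) ||
	    is_kernel_rodata(raddr))
		return raddr;

	return addr;	/* don't trust the adjusted address */
}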
*/ - if (!tr->dir) - return; - - /* Only create trace option files after update_tracer_options finish */ - if (!tracer_options_updated) - return; - - create_trace_option_files(tr, t); -} - int tracing_set_tracer(struct trace_array *tr, const char *buf) { - struct tracer *t; + struct tracer *trace = NULL; + struct tracers *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif @@ -6231,18 +6310,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) ret = 0; } - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) + list_for_each_entry(t, &tr->tracers, list) { + if (strcmp(t->tracer->name, buf) == 0) { + trace = t->tracer; break; + } } - if (!t) + if (!trace) return -EINVAL; - if (t == tr->current_trace) + if (trace == tr->current_trace) return 0; #ifdef CONFIG_TRACER_SNAPSHOT - if (t->use_max_tr) { + if (trace->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? -EBUSY : 0; @@ -6253,14 +6334,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } #endif /* Some tracers won't work on kernel command line */ - if (system_state < SYSTEM_RUNNING && t->noboot) { + if (system_state < SYSTEM_RUNNING && trace->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", - t->name); + trace->name); return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) + if (!trace_ok_for_array(trace, tr)) return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ @@ -6279,8 +6360,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !t->use_max_tr) { + if (had_max_tr && !trace->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -6293,7 +6375,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (!had_max_tr && t->use_max_tr) { + if (!had_max_tr && trace->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; @@ -6302,18 +6384,21 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tr->current_trace = &nop_trace; #endif - if (t->init) { - ret = tracer_init(t, tr); + tr->current_trace_flags = t->flags ? : t->tracer->flags; + + if (trace->init) { + ret = tracer_init(trace, tr); if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE - if (t->use_max_tr) + if (trace->use_max_tr) tracing_disarm_snapshot(tr); #endif + tr->current_trace_flags = nop_trace.flags; return ret; } } - tr->current_trace = t; + tr->current_trace = trace; tr->current_trace->enabled++; trace_branch_enable(tr); @@ -6467,7 +6552,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) @@ -6491,7 +6576,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -6511,7 +6596,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) tr->trace_ref++; - mutex_unlock(&trace_types_lock); return ret; fail: @@ -6520,7 +6604,6 @@ fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } @@ -6529,14 +6612,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; - mutex_lock(&trace_types_lock); - - tr->trace_ref--; + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - close_pipe_on_cpu(tr, iter->cpu_file); - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } free_trace_iter_content(iter); kfree(iter); @@ -6555,7 +6637,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; - if (tr->trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER(BLOCK)) /* * Always select as readable when in blocking mode */ @@ -6610,6 +6692,22 @@ static int tracing_wait_pipe(struct file *filp) return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -6641,6 +6739,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; @@ -6824,7 +6925,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), min((size_t)trace_seq_used(&iter->seq), - PAGE_SIZE)); + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; @@ -6855,6 +6956,43 @@ out_err: } static ssize_t +tracing_syscall_buf_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > SYSCALL_FAULT_USER_MAX) + val = SYSCALL_FAULT_USER_MAX; + + tr->syscall_buf_sz = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -7088,7 +7226,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? 
*/ - if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE)) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -7100,11 +7238,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, + size_t cnt, unsigned long ip) { - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; @@ -7112,32 +7248,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, int meta_size; ssize_t written; size_t size; - int len; - -/* Used in tracing_mark_raw_write() as well */ -#define FAULTED_STR "<faulted>" -#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ - - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if ((ssize_t)cnt < 0) - return -EINVAL; - - if (cnt > TRACE_MARKER_MAX_SIZE) - cnt = TRACE_MARKER_MAX_SIZE; meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; - /* If less than "<faulted>", then make sure we can still add that */ - if (cnt < FAULTED_SIZE) - size += FAULTED_SIZE - cnt; - buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); @@ -7147,9 +7262,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, * make it smaller and try again. */ if (size > ring_buffer_max_event_size(buffer)) { - /* cnt < FAULTED size should never be bigger than max */ - if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) - return -EBADF; cnt = ring_buffer_max_event_size(buffer) - meta_size; /* The above should only happen once */ if (WARN_ON_ONCE(cnt + meta_size == size)) @@ -7162,15 +7274,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; - - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); - if (len) { - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - cnt = FAULTED_SIZE; - written = -EFAULT; - } else - written = cnt; + entry->ip = ip; + memcpy(&entry->buf, buf, cnt); + written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ @@ -7194,33 +7300,351 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } +struct trace_user_buf { + char *buf; +}; + +static DEFINE_MUTEX(trace_user_buffer_mutex); +static struct trace_user_buf_info *trace_user_buffer; + +/** + * trace_user_fault_destroy - free up allocated memory of a trace user buffer + * @tinfo: The descriptor to free up + * + * Frees any data allocated in the trace info dsecriptor. 
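Editor's note: the trace_marker write path is being reworked around a reusable per-CPU user-copy buffer (trace_user_buf_info, below) instead of an inatomic copy straight into the reserved event. From user space the interface is unchanged; for reference, assuming the usual tracefs mount point:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);
	const char msg[] = "hello from user space";
	int ret = 0;

	if (fd < 0)
		return 1;
	if (write(fd, msg, strlen(msg)) < 0)
		ret = 1;
	close(fd);
	return ret;
}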
+ */ +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) +{ + char *buf; + int cpu; + + if (!tinfo || !tinfo->tbuf) + return; + + for_each_possible_cpu(cpu) { + buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + kfree(buf); + } + free_percpu(tinfo->tbuf); +} + +static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) +{ + char *buf; + int cpu; + + lockdep_assert_held(&trace_user_buffer_mutex); + + tinfo->tbuf = alloc_percpu(struct trace_user_buf); + if (!tinfo->tbuf) + return -ENOMEM; + + tinfo->ref = 1; + tinfo->size = size; + + /* Clear each buffer in case of error */ + for_each_possible_cpu(cpu) { + per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL; + } + + for_each_possible_cpu(cpu) { + buf = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!buf) + return -ENOMEM; + per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; + } + + return 0; +} + +/* For internal use. Free and reinitialize */ +static void user_buffer_free(struct trace_user_buf_info **tinfo) +{ + lockdep_assert_held(&trace_user_buffer_mutex); + + trace_user_fault_destroy(*tinfo); + kfree(*tinfo); + *tinfo = NULL; +} + +/* For internal use. Initialize and allocate */ +static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) +{ + bool alloc = false; + int ret; + + lockdep_assert_held(&trace_user_buffer_mutex); + + if (!*tinfo) { + alloc = true; + *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL); + if (!*tinfo) + return -ENOMEM; + } + + ret = user_fault_buffer_enable(*tinfo, size); + if (ret < 0 && alloc) + user_buffer_free(tinfo); + + return ret; +} + +/* For internal use, derefrence and free if necessary */ +static void user_buffer_put(struct trace_user_buf_info **tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) + return; + + if (--(*tinfo)->ref) + return; + + user_buffer_free(tinfo); +} + +/** + * trace_user_fault_init - Allocated or reference a per CPU buffer + * @tinfo: A pointer to the trace buffer descriptor + * @size: The size to allocate each per CPU buffer + * + * Create a per CPU buffer that can be used to copy from user space + * in a task context. When calling trace_user_fault_read(), preemption + * must be disabled, and it will enable preemption and copy user + * space data to the buffer. If any schedule switches occur, it will + * retry until it succeeds without a schedule switch knowing the buffer + * is still valid. + * + * Returns 0 on success, negative on failure. + */ +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) +{ + int ret; + + if (!tinfo) + return -EINVAL; + + guard(mutex)(&trace_user_buffer_mutex); + + ret = user_buffer_init(&tinfo, size); + if (ret < 0) + trace_user_fault_destroy(tinfo); + + return ret; +} + +/** + * trace_user_fault_get - up the ref count for the user buffer + * @tinfo: A pointer to a pointer to the trace buffer descriptor + * + * Ups the ref count of the trace buffer. + * + * Returns the new ref count. + */ +int trace_user_fault_get(struct trace_user_buf_info *tinfo) +{ + if (!tinfo) + return -1; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo->ref++; + return tinfo->ref; +} + +/** + * trace_user_fault_put - dereference a per cpu trace buffer + * @tinfo: The @tinfo that was passed to trace_user_fault_get() + * + * Decrement the ref count of @tinfo. + * + * Returns the new refcount (negative on error). 
+ */ +int trace_user_fault_put(struct trace_user_buf_info *tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) + return -1; + + --tinfo->ref; + return tinfo->ref; +} + +/** + * trace_user_fault_read - Read user space into a per CPU buffer + * @tinfo: The @tinfo allocated by trace_user_fault_get() + * @ptr: The user space pointer to read + * @size: The size of user space to read. + * @copy_func: Optional function to use to copy from user space + * @data: Data to pass to copy_func if it was supplied + * + * Preemption must be disabled when this is called, and must not + * be enabled while using the returned buffer. + * This does the copying from user space into a per CPU buffer. + * + * The @size must not be greater than the size passed in to + * trace_user_fault_init(). + * + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), + * otherwise it will call @copy_func. It will call @copy_func with: + * + * buffer: the per CPU buffer of the @tinfo. + * ptr: The pointer @ptr to user space to read + * size: The @size of the ptr to read + * data: The @data parameter + * + * It is expected that @copy_func will return 0 on success and non zero + * if there was a fault. + * + * Returns a pointer to the buffer with the content read from @ptr. + * Preemption must remain disabled while the caller accesses the + * buffer returned by this function. + * Returns NULL if there was a fault, or the size passed in is + * greater than the size passed to trace_user_fault_init(). + */ +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data) +{ + int cpu = smp_processor_id(); + char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + unsigned int cnt; + int trys = 0; + int ret; + + lockdep_assert_preemption_disabled(); + + /* + * It's up to the caller to not try to copy more than it said + * it would. + */ + if (size > tinfo->size) + return NULL; + + /* + * This acts similar to a seqcount. The per CPU context switches are + * recorded, migration is disabled and preemption is enabled. The + * read of the user space memory is copied into the per CPU buffer. + * Preemption is disabled again, and if the per CPU context switches count + * is still the same, it means the buffer has not been corrupted. + * If the count is different, it is assumed the buffer is corrupted + * and reading must be tried again. + */ + + do { + /* + * If for some reason, copy_from_user() always causes a context + * switch, this would then cause an infinite loop. + * If this task is preempted by another user space task, it + * will cause this task to try again. But just in case something + * changes where the copying from user space causes another task + * to run, prevent this from going into an infinite loop. + * 100 tries should be plenty. + */ + if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space")) + return NULL; + + /* Read the current CPU context switch counter */ + cnt = nr_context_switches_cpu(cpu); + + /* + * Preemption is going to be enabled, but this task must + * remain on this CPU. + */ + migrate_disable(); + + /* + * Now preemption is being enabled and another task can come in + * and use the same buffer and corrupt our data. 
+ */ + preempt_enable_notrace(); + + /* Make sure preemption is enabled here */ + lockdep_assert_preemption_enabled(); + + if (copy_func) { + ret = copy_func(buffer, ptr, size, data); + } else { + ret = __copy_from_user(buffer, ptr, size); + } + + preempt_disable_notrace(); + migrate_enable(); + + /* if it faulted, no need to test if the buffer was corrupted */ + if (ret) + return NULL; + + /* + * Preemption is disabled again, now check the per CPU context + * switch counter. If it doesn't match, then another user space + * process may have schedule in and corrupted our buffer. In that + * case the copying must be retried. + */ + } while (nr_context_switches_cpu(cpu) != cnt); + + return buffer; +} + static ssize_t -tracing_mark_raw_write(struct file *filp, const char __user *ubuf, +tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct raw_data_entry *entry; - ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + ssize_t written = -ENODEV; + unsigned long ip; + char *buf; if (tracing_disabled) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; - /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int)) + if ((ssize_t)cnt < 0) return -EINVAL; - size = sizeof(*entry) + cnt; - if (cnt < FAULT_SIZE_ID) - size += FAULT_SIZE_ID - cnt; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, buf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, buf, cnt, ip); + } + + return written; +} + +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char *buf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + size_t size; + + /* cnt includes both the entry->id and the data behind it. */ + size = struct_offset(entry, id) + cnt; buffer = tr->array_buffer.buffer; @@ -7234,20 +7658,88 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EBADF; entry = ring_buffer_event_data(event); - - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); - if (len) { - entry->id = -1; - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - written = -EFAULT; - } else - written = cnt; + unsafe_memcpy(&entry->id, buf, cnt, + "id and content already reserved on ring buffer" + "'buf' includes the 'id' and the data." 
+ "'entry' was allocated with cnt from 'id'."); + written = cnt; __buffer_unlock_commit(buffer, event); return written; } +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + char *buf; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* raw write is all or nothing */ + if (cnt > TRACE_MARKER_MAX_SIZE) + return -EINVAL; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, buf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, buf, cnt); + } + + return written; +} + +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + int ret; + + scoped_guard(mutex, &trace_user_buffer_mutex) { + if (!trace_user_buffer) { + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); + if (ret < 0) + return ret; + } else { + trace_user_buffer->ref++; + } + } + + stream_open(inode, filp); + ret = tracing_open_generic_tr(inode, filp); + if (ret < 0) + user_buffer_put(&trace_user_buffer); + return ret; +} + +static int tracing_mark_release(struct inode *inode, struct file *file) +{ + user_buffer_put(&trace_user_buffer); + return tracing_release_generic_tr(inode, file); +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -7274,7 +7766,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; @@ -7292,7 +7784,11 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->max_buffer); #endif - mutex_unlock(&trace_types_lock); + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } return 0; } @@ -7345,15 +7841,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); - mutex_unlock(&trace_types_lock); - return 0; } @@ -7630,6 +8124,14 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_syscall_buf_fops = { + .open = tracing_open_generic_tr, + .read = tracing_syscall_buf_read, + .write = tracing_syscall_buf_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, @@ -7653,13 +8155,13 @@ static const struct file_operations tracing_free_buffer_fops = { static const struct file_operations tracing_mark_fops = { .open = tracing_mark_open, .write = 
tracing_mark_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_mark_open, .write = tracing_mark_raw_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations trace_clock_fops = { @@ -7941,14 +8443,14 @@ static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; - mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) @@ -8167,6 +8669,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8216,7 +8721,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; @@ -8227,8 +8732,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) info->spare_cpu, info->spare); kvfree(info); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8436,14 +8939,13 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - mutex_unlock(&trace_types_lock); return 0; } @@ -8494,8 +8996,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) @@ -8504,12 +9016,8 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; - /* A memmap'ed buffer is not supported for user space mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) - return -ENODEV; - - /* Currently the boot mapped buffer is not supported for mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + /* A memmap'ed and backup buffers are not supported for user space mmap */ + if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC)) return -ENODEV; ret = get_snapshot_map(iter->tr); @@ -8788,12 +9296,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) - goto out; + return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); - out: + return ret < 0 ? 
ret : 0; } @@ -8812,13 +9320,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (WARN_ON(!tr->dir)) - return ERR_PTR(-ENODEV); - /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } @@ -8937,10 +9445,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -9023,7 +9530,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, get_tr_index(tr_index, &tr, &index); - if (tr->trace_flags & (1 << index)) + if (tr->trace_flags & (1ULL << index)) buf = "1\n"; else buf = "0\n"; @@ -9052,7 +9559,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = set_tracer_flag(tr, 1 << index, val); + ret = set_tracer_flag(tr, 1ULL << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -9125,39 +9632,19 @@ create_trace_option_file(struct trace_array *tr, topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); - } -static void -create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +static int +create_trace_option_files(struct trace_array *tr, struct tracer *tracer, + struct tracer_flags *flags) { struct trace_option_dentry *topts; struct trace_options *tr_topts; - struct tracer_flags *flags; struct tracer_opt *opts; int cnt; - int i; - - if (!tracer) - return; - - flags = tracer->flags; if (!flags || !flags->opts) - return; - - /* - * If this is an instance, only create flags for tracers - * the instance may have. - */ - if (!trace_ok_for_array(tracer, tr)) - return; - - for (i = 0; i < tr->nr_topts; i++) { - /* Make sure there's no duplicate flags. 
*/ - if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) - return; - } + return 0; opts = flags->opts; @@ -9166,13 +9653,13 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return; + return 0; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); - return; + return -ENOMEM; } tr->topts = tr_topts; @@ -9187,6 +9674,97 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) "Failed to create trace option: %s", opts[cnt].name); } + return 0; +} + +static int get_global_flags_val(struct tracer *tracer) +{ + struct tracers *t; + + list_for_each_entry(t, &global_trace.tracers, list) { + if (t->tracer != tracer) + continue; + if (!t->flags) + return -1; + return t->flags->val; + } + return -1; +} + +static int add_tracer_options(struct trace_array *tr, struct tracers *t) +{ + struct tracer *tracer = t->tracer; + struct tracer_flags *flags = t->flags ?: tracer->flags; + + if (!flags) + return 0; + + /* Only add tracer options after update_tracer_options finish */ + if (!tracer_options_updated) + return 0; + + return create_trace_option_files(tr, tracer, flags); +} + +static int add_tracer(struct trace_array *tr, struct tracer *tracer) +{ + struct tracer_flags *flags; + struct tracers *t; + int ret; + + /* Only enable if the directory has been created already. */ + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return 0; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + t->tracer = tracer; + t->flags = NULL; + list_add(&t->list, &tr->tracers); + + flags = tracer->flags; + if (!flags) { + if (!tracer->default_flags) + return 0; + + /* + * If the tracer defines default flags, it means the flags are + * per trace instance. + */ + flags = kmalloc(sizeof(*flags), GFP_KERNEL); + if (!flags) + return -ENOMEM; + + *flags = *tracer->default_flags; + flags->trace = tracer; + + t->flags = flags; + + /* If this is an instance, inherit the global_trace flags */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + int val = get_global_flags_val(tracer); + if (!WARN_ON_ONCE(val < 0)) + flags->val = val; + } + } + + ret = add_tracer_options(tr, t); + if (ret < 0) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } + + return ret; } static struct dentry * @@ -9216,8 +9794,9 @@ static void create_trace_options_dir(struct trace_array *tr) for (i = 0; trace_options[i]; i++) { if (top_level || - !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) { create_trace_option_core_file(tr, trace_options[i], i); + } } } @@ -9249,7 +9828,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { @@ -9263,7 +9842,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf, /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -9517,6 +10095,15 @@ static void setup_trace_scratch(struct trace_array *tr, /* Scan modules to make text delta for modules. */ module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. 
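 * The boot-mapped ring buffer still holds events stamped with the
 * previous boot's clock, so those timestamps are only meaningful if
 * this boot reads them with the same trace_clock.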
*/ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } + } return; reset: /* Invalid trace modules */ @@ -9530,7 +10117,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size struct trace_scratch *tscratch; unsigned int scratch_size = 0; - rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -9628,20 +10215,39 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } -static void __update_tracer_options(struct trace_array *tr) +static int __update_tracer(struct trace_array *tr) { struct tracer *t; + int ret = 0; + + for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer(tr, t); - for (t = trace_types; t; t = t->next) - add_tracer_options(tr, t); + return ret; } -static void update_tracer_options(struct trace_array *tr) +static __init int __update_tracer_options(struct trace_array *tr) { - mutex_lock(&trace_types_lock); + struct tracers *t; + int ret = 0; + + list_for_each_entry(t, &tr->tracers, list) { + ret = add_tracer_options(tr, t); + if (ret < 0) + break; + } + + return ret; +} + +static __init void update_tracer_options(void) +{ + struct trace_array *tr; + + guard(mutex)(&trace_types_lock); tracer_options_updated = true; - __update_tracer_options(tr); - mutex_unlock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __update_tracer_options(tr); } /* Must have trace_types_lock held */ @@ -9663,11 +10269,10 @@ struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; - mutex_unlock(&trace_types_lock); return tr; } @@ -9687,9 +10292,13 @@ static int trace_array_create_dir(struct trace_array *tr) } init_tracer_tracefs(tr, tr->dir); - __update_tracer_options(tr); - - return ret; + ret = __update_tracer(tr); + if (ret) { + event_trace_del_tracer(tr); + tracefs_remove(tr->dir); + return ret; + } + return 0; } static struct trace_array * @@ -9731,16 +10340,21 @@ trace_array_create_systems(const char *name, const char *systems, raw_spin_lock_init(&tr->start_lock); + tr->syscall_buf_sz = global_trace.syscall_buf_sz; + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->tracers); + INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); @@ -9893,13 +10507,16 @@ static int __remove_instance(struct trace_array *tr) /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { - if ((1 << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1 << i, 0); + if ((1ULL << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1ULL << i, 0); } if (printk_trace == tr) update_printk_trace(&global_trace); + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9909,11 +10526,14 @@ static int 
__remove_instance(struct trace_array *tr) free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); + free_tracers(tr); if (tr->range_name) { reserve_mem_release_by_name(tr->range_name); kfree(tr->range_name); } + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + vfree((void *)tr->range_addr_start); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); @@ -10043,6 +10663,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &buffer_subbuf_size_fops); + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_syscall_buf_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE @@ -10071,10 +10694,13 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; + struct fs_context *fc; + int ret; /* * To maintain backward compatibility for tools that mount @@ -10084,14 +10710,24 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_submount(mntpt, type, "tracefs", NULL); + + fc = fs_context_for_submount(type, mntpt); put_filesystem(type); - if (IS_ERR(mnt)) - return NULL; - mntget(mnt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + ret = vfs_parse_fs_string(fc, "source", "tracefs"); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); return mnt; } +#endif /** * tracing_init_dentry - initialize top level trace array @@ -10116,6 +10752,7 @@ int tracing_init_dentry(void) if (WARN_ON(!tracefs_initialized())) return -ENODEV; +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount @@ -10124,6 +10761,7 @@ int tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); +#endif return 0; } @@ -10140,7 +10778,7 @@ static void __init eval_map_work_func(struct work_struct *work) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) @@ -10178,7 +10816,7 @@ bool module_exists(const char *module) { /* All modules have the symbol __this_module */ static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; @@ -10193,9 +10831,6 @@ bool module_exists(const char *module) static void trace_module_add_evals(struct module *mod) { - if (!mod->num_trace_evals) - return; - /* * Modules with bad taint do not have events created, do * not bother with enums either. @@ -10203,7 +10838,8 @@ static void trace_module_add_evals(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); + /* Even if no trace_evals, this need to sanitize field types. 
*/ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE @@ -10315,7 +10951,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work) create_trace_instances(NULL); - update_tracer_options(&global_trace); + update_tracer_options(); } static __init int tracer_init_tracefs(void) @@ -10335,7 +10971,8 @@ static __init int tracer_init_tracefs(void) tracer_init_tracefs_work_func(NULL); } - rv_init_interface(); + if (rv_init_interface()) + pr_err("RV: Error while creating the RV interface\n"); return 0; } @@ -10448,7 +11085,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; + int cnt = 0; /* * Always turn off tracing when we dump. @@ -10465,14 +11102,13 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* Simulate the iterator */ trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); - old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ); /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); @@ -10513,10 +11149,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) @@ -10526,9 +11162,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(tr); local_irq_restore(flags); } @@ -10610,7 +11244,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { - char *kbuf, *buf, *tmp; + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -10625,10 +11260,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + kbuf[size] = '\0'; buf = kbuf; do { @@ -10644,8 +11278,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; + return -EINVAL; } } done += size; @@ -10658,17 +11291,12 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, ret = createfn(buf); if (ret) - goto out; + return ret; buf += size; } while (done < count); } - ret = done; - -out: - kfree(kbuf); - - return ret; + return done; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -10703,7 +11331,7 @@ __init static void do_allocate_snapshot(const char *name) /* * When allocate_snapshot is set, the next call to * allocate_trace_buffers() (called by 
trace_array_get_by_name()) - * will allocate the snapshot buffer. That will alse clear + * will allocate the snapshot buffer. That will also clear * this flag. */ allocate_snapshot = true; @@ -10712,6 +11340,42 @@ __init static void do_allocate_snapshot(const char *name) static inline void do_allocate_snapshot(const char *name) { } #endif +__init static int backup_instance_area(const char *backup, + unsigned long *addr, phys_addr_t *size) +{ + struct trace_array *backup_tr; + void *allocated_vaddr = NULL; + + backup_tr = trace_array_get_by_name(backup, NULL); + if (!backup_tr) { + pr_warn("Tracing: Instance %s is not found.\n", backup); + return -ENOENT; + } + + if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) { + pr_warn("Tracing: Instance %s is not boot mapped.\n", backup); + trace_array_put(backup_tr); + return -EINVAL; + } + + *size = backup_tr->range_addr_size; + + allocated_vaddr = vzalloc(*size); + if (!allocated_vaddr) { + pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n", + backup, (unsigned long)*size); + trace_array_put(backup_tr); + return -ENOMEM; + } + + memcpy(allocated_vaddr, + (void *)backup_tr->range_addr_start, (size_t)*size); + *addr = (unsigned long)allocated_vaddr; + + trace_array_put(backup_tr); + return 0; +} + __init static void enable_instances(void) { struct trace_array *tr; @@ -10734,11 +11398,15 @@ __init static void enable_instances(void) char *flag_delim; char *addr_delim; char *rname __free(kfree) = NULL; + char *backup; tok = strsep(&curr_str, ","); - flag_delim = strchr(tok, '^'); - addr_delim = strchr(tok, '@'); + name = strsep(&tok, "="); + backup = tok; + + flag_delim = strchr(name, '^'); + addr_delim = strchr(name, '@'); if (addr_delim) *addr_delim++ = '\0'; @@ -10746,7 +11414,10 @@ __init static void enable_instances(void) if (flag_delim) *flag_delim++ = '\0'; - name = tok; + if (backup) { + if (backup_instance_area(backup, &addr, &size) < 0) + continue; + } if (flag_delim) { char *flag; @@ -10842,7 +11513,13 @@ __init static void enable_instances(void) tr->ref++; } - if (start) { + /* + * Backup buffers can be freed but need vfree(). + */ + if (backup) + tr->flags |= TRACE_ARRAY_FL_VMALLOC; + + if (start || backup) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); } @@ -10871,7 +11548,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -10936,6 +11613,7 @@ __init static int tracer_alloc_buffers(void) * just a bootstrap of current_trace anyway. 
*/ global_trace.current_trace = &nop_trace; + global_trace.current_trace_flags = nop_trace.flags; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE @@ -10949,10 +11627,7 @@ __init static int tracer_alloc_buffers(void) init_trace_flags_index(&global_trace); - register_tracer(&nop_trace); - - /* Function tracing may start here (via kernel command line) */ - init_function_trace(); + INIT_LIST_HEAD(&global_trace.tracers); /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -10964,12 +11639,20 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + global_trace.syscall_buf_sz = syscall_buf_size; + INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); + register_tracer(&nop_trace); + + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + apply_trace_boot_options(); register_snapshot_cmd(); @@ -10988,13 +11671,12 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } #ifdef CONFIG_FUNCTION_TRACER /* Used to set module cached ftrace filtering at boot up */ -__init struct trace_array *trace_get_global_array(void) +struct trace_array *trace_get_global_array(void) { return &global_trace; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79be1995db44..b6d42fe06115 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/once_lite.h> #include <linux/ftrace_regs.h> +#include <linux/llist.h> #include "pid_list.h" @@ -131,6 +132,8 @@ enum trace_type { #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 +#define SYSCALL_FAULT_USER_MAX 165 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -183,8 +186,7 @@ struct trace_array; * the trace, etc.) 
*/ struct trace_array_cpu { - atomic_t disabled; - void *buffer_page; /* ring buffer spare */ + local_t disabled; unsigned long entries; unsigned long saved_latency; @@ -217,7 +219,7 @@ struct array_buffer { int cpu; }; -#define TRACE_FLAGS_MAX_SIZE 32 +#define TRACE_FLAGS_MAX_SIZE 64 struct trace_options { struct tracer *tracer; @@ -381,8 +383,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; + struct trace_event_file *enter_syscall_files[NR_syscalls]; + struct trace_event_file *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -391,7 +393,8 @@ struct trace_array { int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; - unsigned int trace_flags; + struct tracer_flags *current_trace_flags; + u64 trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; @@ -404,6 +407,8 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct list_head marker_list; + struct list_head tracers; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -430,6 +435,7 @@ struct trace_array { int function_enabled; #endif int no_filter_buffering_ref; + unsigned int syscall_buf_sz; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; @@ -448,6 +454,7 @@ enum { TRACE_ARRAY_FL_LAST_BOOT = BIT(2), TRACE_ARRAY_FL_MOD_INIT = BIT(3), TRACE_ARRAY_FL_MEMMAP = BIT(4), + TRACE_ARRAY_FL_VMALLOC = BIT(5), }; #ifdef CONFIG_MODULES @@ -631,9 +638,10 @@ struct tracer { u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, - u32 mask, int set); + u64 mask, int set); struct tracer *next; struct tracer_flags *flags; + struct tracer_flags *default_flags; int enabled; bool print_max; bool allow_instances; @@ -665,12 +673,29 @@ bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); +void tracer_tracing_disable(struct trace_array *tr); +void tracer_tracing_enable(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); + +/** + * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu + * @tr : the trace array to know if ring buffer is enabled + * @cpu: The cpu buffer to check if enabled + * + * Shows real state of the per CPU buffer if it is enabled or not. 
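 *
 * For example (illustrative; @tr and the CPU come from the caller, as in
 * the trace_branch.c hunk of this patch):
 *
 *	if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id()))
 *		return;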
+ */ +static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); + return false; +} + int tracing_init_dentry(void); struct ring_buffer_event; @@ -920,8 +945,6 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) -extern void ftrace_graph_sleep_time_control(bool enable); - #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else @@ -941,7 +964,8 @@ extern int __trace_graph_entry(struct trace_array *tr, extern int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr); + unsigned long retaddr, + struct ftrace_regs *fregs); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx, @@ -1092,7 +1116,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; -extern bool fgraph_sleep_time; +extern int fgraph_no_sleep_time; +extern bool fprofile_no_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) @@ -1137,11 +1162,6 @@ struct ftrace_func_command { char *params, int enable); }; extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != - FTRACE_PID_IGNORE; -} extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); @@ -1159,10 +1179,6 @@ void ftrace_clear_pids(struct trace_array *tr); int init_function_trace(void); void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return 1; -} static inline int ftrace_is_dead(void) { return 0; } static inline int ftrace_create_function_files(struct trace_array *tr, @@ -1275,6 +1291,7 @@ bool ftrace_event_is_function(struct trace_event_call *call); */ struct trace_parser { bool cont; + bool fail; char *buffer; unsigned idx; unsigned size; @@ -1282,7 +1299,7 @@ struct trace_parser { static inline bool trace_parser_loaded(struct trace_parser *parser) { - return (parser->idx != 0); + return !parser->fail && parser->idx != 0; } static inline bool trace_parser_cont(struct trace_parser *parser) @@ -1296,6 +1313,11 @@ static inline void trace_parser_clear(struct trace_parser *parser) parser->idx = 0; } +static inline void trace_parser_fail(struct trace_parser *parser) +{ + parser->fail = true; +} + extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, @@ -1322,11 +1344,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), -# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION) #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL -# define TRACE_ITER_FUNC_FORK 0UL +# define TRACE_ITER_FUNC_FORK_BIT -1 #endif #ifdef CONFIG_STACKTRACE @@ -1336,6 +1358,24 @@ extern int trace_get_user(struct 
trace_parser *parser, const char __user *ubuf, # define STACK_FLAGS #endif +#ifdef CONFIG_FUNCTION_PROFILER +# define PROFILER_FLAGS \ + C(PROF_TEXT_OFFSET, "prof-text-offset"), +# ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FPROFILE_FLAGS \ + C(GRAPH_TIME, "graph-time"), +# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME) +# else +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# endif +#else +# define PROFILER_FLAGS +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# define TRACE_ITER_PROF_TEXT_OFFSET_BIT -1 +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -1368,12 +1408,15 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ + C(COPY_MARKER, "copy_trace_marker"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ - BRANCH_FLAGS + BRANCH_FLAGS \ + PROFILER_FLAGS \ + FPROFILE_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names @@ -1389,20 +1432,17 @@ enum trace_iterator_bits { }; /* - * By redefining C, we can make TRACE_FLAGS a list of masks that - * use the bits as defined above. + * And use TRACE_ITER(flag) to define the bit masks. */ -#undef C -#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) - -enum trace_iterator_flags { TRACE_FLAGS }; +#define TRACE_ITER(flag) \ + (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT)) /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ #define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR)) extern struct tracer nop_trace; @@ -1411,7 +1451,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (tr->trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER(BRANCH)) return enable_branch_tracing(tr); return 0; } @@ -1507,6 +1547,23 @@ void trace_buffered_event_enable(void); void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); +struct trace_user_buf; +struct trace_user_buf_info { + struct trace_user_buf __percpu *tbuf; + size_t size; + int ref; +}; + +typedef int (*trace_user_buf_copy)(char *dst, const char __user *src, + size_t size, void *data); +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size); +int trace_user_fault_get(struct trace_user_buf_info *tinfo); +int trace_user_fault_put(struct trace_user_buf_info *tinfo); +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo); +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) @@ -1728,13 +1785,13 @@ extern void clear_event_triggers(struct trace_array *tr); enum { EVENT_TRIGGER_FL_PROBE = BIT(0), + EVENT_TRIGGER_FL_COUNT = BIT(1), }; struct event_trigger_data { unsigned long count; int ref; int flags; - const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1745,6 +1802,7 @@ struct event_trigger_data { char 
*name; struct list_head named_list; struct event_trigger_data *named_data; + struct llist_node llist; }; /* Avoid typos */ @@ -1759,6 +1817,10 @@ struct enable_trigger_data { bool hist; }; +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event); + extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_data *data); @@ -1772,6 +1834,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1798,11 +1863,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, @@ -1824,64 +1884,6 @@ extern void event_file_get(struct trace_event_file *file); extern void event_file_put(struct trace_event_file *file); /** - * struct event_trigger_ops - callbacks for trace event triggers - * - * The methods in this structure provide per-event trigger hooks for - * various trigger operations. - * - * The @init and @free methods are used during trigger setup and - * teardown, typically called from an event_command's @parse() - * function implementation. - * - * The @print method is used to print the trigger spec. - * - * The @trigger method is the function that actually implements the - * trigger and is called in the context of the triggering event - * whenever that event occurs. - * - * All the methods below, except for @init() and @free(), must be - * implemented. - * - * @trigger: The trigger 'probe' function called when the triggering - * event occurs. The data passed into this callback is the data - * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command) along with - * the trace record, rec. - * - * @init: An optional initialization function called for the trigger - * when the trigger is registered (via the event_command reg() - * function). This can be used to perform per-trigger - * initialization such as incrementing a per-trigger reference - * count, for instance. This is usually implemented by the - * generic utility function @event_trigger_init() (see - * trace_event_triggers.c). - * - * @free: An optional de-initialization function called for the - * trigger when the trigger is unregistered (via the - * event_command @reg() function). This can be used to perform - * per-trigger de-initialization such as decrementing a - * per-trigger reference count and freeing corresponding trigger - * data, for instance. This is usually implemented by the - * generic utility function @event_trigger_free() (see - * trace_event_triggers.c). 
- * - * @print: The callback function invoked to have the trigger print - * itself. This is usually implemented by a wrapper function - * that calls the generic utility function @event_trigger_print() - * (see trace_event_triggers.c). - */ -struct event_trigger_ops { - void (*trigger)(struct event_trigger_data *data, - struct trace_buffer *buffer, - void *rec, - struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_data *data); - void (*free)(struct event_trigger_data *data); - int (*print)(struct seq_file *m, - struct event_trigger_data *data); -}; - -/** * struct event_command - callbacks and data members for event commands * * Event commands are invoked by users by writing the command name @@ -1930,7 +1932,7 @@ struct event_trigger_ops { * * @reg: Adds the trigger to the list of triggers associated with the * event, and enables the event trigger itself, after - * initializing it (via the event_trigger_ops @init() function). + * initializing it (via the event_command @init() function). * This is also where commands can use the @trigger_type value to * make the decision as to whether or not multiple instances of * the trigger should be allowed. This is usually implemented by @@ -1939,7 +1941,7 @@ struct event_trigger_ops { * * @unreg: Removes the trigger from the list of triggers associated * with the event, and disables the event trigger itself, after - * initializing it (via the event_trigger_ops @free() function). + * initializing it (via the event_command @free() function). * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * @@ -1953,12 +1955,41 @@ struct event_trigger_ops { * ignored. This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). * - * @get_trigger_ops: The callback function invoked to retrieve the - * event_trigger_ops implementation associated with the command. - * This callback function allows a single event_command to - * support multiple trigger implementations via different sets of - * event_trigger_ops, depending on the value of the @param - * string. + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @trigger: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command) along with + * the trace record, rec. + * + * @count_func: If defined and a numeric parameter is passed to the + * trigger, then this function will be called before @trigger + * is called. If this function returns false, then @trigger is not + * executed. + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. 
This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). */ struct event_command { struct list_head list; @@ -1979,7 +2010,18 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + bool (*count_func)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_data *data); }; /** @@ -2000,7 +2042,7 @@ struct event_command { * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation + * words, if the event_command @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * @@ -2042,8 +2084,8 @@ extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set); +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); @@ -2109,13 +2151,13 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_eval_update(struct trace_eval_map **map, int len); +void trace_event_update_all(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_update_all(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT @@ -2188,7 +2230,7 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } @@ -2226,4 +2268,25 @@ static inline int rv_init_interface(void) */ #define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) +/* + * This is used to get the address of the args array based on + * the type of the entry. 
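 *
 * For example, a print handler can do (illustrative; "field" is whatever
 * ftrace_graph_ent_entry pointer the caller already has):
 *
 *	unsigned long *args = FGRAPH_ENTRY_ARGS(field);
 *
 * and get the args array whether the entry is a plain graph entry or a
 * CONFIG_FUNCTION_GRAPH_RETADDR fgraph_retaddr_ent_entry.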
+ */ +#define FGRAPH_ENTRY_ARGS(e) \ + ({ \ + unsigned long *_args; \ + struct ftrace_graph_ent_entry *_e = e; \ + \ + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && \ + e->ent.type == TRACE_GRAPH_RETADDR_ENT) { \ + struct fgraph_retaddr_ent_entry *_re; \ + \ + _re = (typeof(_re))_e; \ + _args = _re->args; \ + } else { \ + _args = _e->args; \ + } \ + _args; \ + }) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6d08a5523ce0..6809b370e991 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; - struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; @@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id())) goto out; trace_ctx = tracing_gen_ctx_flags(flags); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a5..c4dfbc293bae 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; @@ -130,9 +144,16 @@ static int create_dyn_event(const char *raw_command) if (!ret || ret != -ECANCELED) break; } - mutex_unlock(&dyn_event_ops_mutex); - if (ret == -ECANCELED) + if (ret == -ECANCELED) { + static const char *err_msg[] = {"No matching dynamic event type"}; + + /* Wrong dynamic event. Leave an error message. 
*/ + tracing_log_err(NULL, "dynevent", raw_command, err_msg, + 0, 0); ret = -EINVAL; + } + + mutex_unlock(&dyn_event_ops_mutex); return ret; } @@ -216,6 +237,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4ef4df6623a8..f6a8d29c0d76 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR @@ -95,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, TRACE_GRAPH_RETADDR_ENT, F_STRUCT( - __field_struct( struct fgraph_retaddr_ent, graph_ent ) - __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) - __field_packed( unsigned long, graph_ent, retaddr ) + __field_struct( struct fgraph_retaddr_ent, graph_rent ) + __field_packed( unsigned long, graph_rent.ent, func ) + __field_packed( unsigned long, graph_rent.ent, depth ) + __field_packed( unsigned long, graph_rent, retaddr ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); @@ -124,13 +125,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) __field_packed( unsigned long, ret, retval ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth, __entry->retval) @@ -146,13 +147,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx 
end: %llx) over: %d", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32..3ee39715d5e4 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,14 +9,15 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include <linux/cleanup.h> +#include <linux/ftrace.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/ftrace.h> #include "trace_dynevent.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -60,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep); } +DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *, + if (!IS_ERR_OR_NULL(_T)) trace_event_probe_cleanup(_T)) + static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) { return container_of(ev, struct trace_eprobe, devent); @@ -196,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group, struct trace_event_call *event, int nargs) { - struct trace_eprobe *ep; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; const char *event_name; const char *sys_name; - int ret = -ENOMEM; + int ret; if (!event) return ERR_PTR(-ENODEV); @@ -210,25 +214,22 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { trace_event_put_ref(event); - goto error; + return ERR_PTR(-ENOMEM); } ep->event = event; ep->event_name = kstrdup(event_name, GFP_KERNEL); if (!ep->event_name) - goto error; + return ERR_PTR(-ENOMEM); ep->event_system = kstrdup(sys_name, GFP_KERNEL); if (!ep->event_system) - goto error; + return ERR_PTR(-ENOMEM); ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); - return ep; -error: - trace_event_probe_cleanup(ep); - return ERR_PTR(ret); + return_ptr(ep); } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec) val = *(unsigned int *)addr; break; default: - if (field->is_signed) - val = *(long *)addr; - else - val = *(unsigned long *)addr; + if (field->size == sizeof(long)) { + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + /* This is an array, point to the addr itself */ + val = (unsigned long)addr; break; } return val; @@ -478,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static const struct event_trigger_ops eprobe_trigger_ops = { - .trigger = eprobe_trigger_func, - .print = eprobe_trigger_print, - .init = eprobe_trigger_init, - .free = eprobe_trigger_free, -}; - static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, @@ -507,12 +506,6 @@ static void eprobe_trigger_unreg_func(char *glob, } -static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) -{ - return &eprobe_trigger_ops; -} - static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, @@ -521,8 +514,11 @@ static struct event_command event_trigger_cmd = { .reg = eprobe_trigger_reg_func, 
.unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, - .get_trigger_ops = eprobe_trigger_get_ops, .set_filter = NULL, + .trigger = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, }; static struct event_trigger_data * @@ -542,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->flags = EVENT_TRIGGER_FL_PROBE; trigger->count = -1; - trigger->ops = &eprobe_trigger_ops; /* * EVENT PROBE triggers are not registered as commands with @@ -795,23 +790,6 @@ find_and_get_event(const char *system, const char *event_name) return NULL; } -static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) -{ - struct traceprobe_parse_context ctx = { - .event = ep->event, - .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, - }; - int ret; - - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); - /* Handle symbols "@" */ - if (!ret) - ret = traceprobe_update_arg(&ep->tp.args[i]); - - traceprobe_finish_parse(&ctx); - return ret; -} - static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) { struct event_filter *dummy = NULL; @@ -848,13 +826,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, true, &dummy); free_event_filter(dummy); - if (ret) - goto error; - - return 0; -error: - kfree(ep->filter_str); - ep->filter_str = NULL; + if (ret) { + kfree(ep->filter_str); + ep->filter_str = NULL; + } return ret; } @@ -866,40 +841,52 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * <name>=$<field>[:TYPE] */ + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; - struct trace_eprobe *ep = NULL; - char buf1[MAX_EVENT_NAME_LEN]; - char buf2[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; + char *buf1 __free(kfree) = NULL; + char *buf2 __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; int ret = 0, filter_idx = 0; int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; - trace_probe_log_init("event_probe", argc, argv); + trlog = trace_probe_log_init("event_probe", argc, argv); event = strchr(&argv[0][1], ':'); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } trace_probe_log_set_index(1); sys_event = argv[1]; + + buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf2) + return -ENOMEM; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); - goto parse_error; + return -EINVAL; } if (!event) { - strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + buf1 = kstrdup(sys_event, GFP_KERNEL); + if (!buf1) + return -ENOMEM; event = buf1; } @@ -915,8 +902,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); trace_probe_log_err(0, TOO_MANY_ARGS); - ret = -E2BIG; - goto error; + return -E2BIG; } scoped_guard(mutex, &event_mutex) { @@ -930,29 +916,39 @@ 
static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); - ep = NULL; - goto error; + return ret; } if (filter_idx) { trace_probe_log_set_index(filter_idx); ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); if (ret) - goto parse_error; + return -EINVAL; } else ep->filter_str = NULL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ret = trace_eprobe_tp_update_arg(ep, argv, i); + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); if (ret) - goto error; + return ret; } ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); if (ret < 0) - goto error; + return ret; + init_trace_eprobe_call(ep); scoped_guard(mutex, &event_mutex) { ret = trace_probe_register_event_call(&ep->tp); @@ -961,20 +957,17 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); trace_probe_log_err(0, EVENT_EXIST); } - goto error; + return ret; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); if (ret < 0) { trace_probe_unregister_event_call(&ep->tp); - goto error; + return ret; } + /* To avoid freeing registered eprobe event, clear ep. */ + ep = NULL; } return ret; -parse_error: - ret = -EINVAL; -error: - trace_event_probe_cleanup(ep); - return ret; } /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 069e92856bda..76067529db61 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca /* Anything else, this isn't a function */ break; } - /* A function could be wrapped in parethesis, try the next one */ + /* A function could be wrapped in parenthesis, try the next one */ s = r + 1; } while (s < e); @@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca return true; } +static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, + u64 *dereference_flags, int arg, + struct trace_event_call *call) +{ + if (string_flags & (1ULL << arg)) { + if (process_string(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + else + pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", + len, arg_str); +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -553,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call) * If start_arg is zero, then this is the start of the * first argument. The processing of the argument happens * when the end of the argument is found, as it needs to - * handle paranthesis and such. + * handle parenthesis and such. 
*/ if (!start_arg) { start_arg = i; @@ -563,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + e - start_arg, + &dereference_flags, arg, call); } start_arg = i; @@ -578,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + i - start_arg, + &dereference_flags, arg, call); } /* @@ -622,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; - struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; @@ -632,9 +641,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list && !no_pid_list) return false; - data = this_cpu_ptr(tr->array_buffer.data); - - return data->ignore_pid; + /* + * This is recorded at every sched_switch for this task. + * Thus, even if the task migrates the ignore value will be the same. + */ + return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); @@ -689,6 +700,8 @@ int trace_event_reg(struct trace_event_call *call, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: + if (!call->class->perf_probe) + return -ENODEV; return tracepoint_probe_register(call->tp, call->class->perf_probe, call); @@ -757,6 +770,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; @@ -771,19 +785,19 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * is set we do not want the event to be enabled before we * clear the bit. * - * When soft_disable is not set but the SOFT_MODE flag is, + * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise - * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. 
*/ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; - clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else - disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); @@ -801,8 +815,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, WARN_ON_ONCE(ret); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & EVENT_FILE_FL_SOFT_MODE) + /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ + if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -812,7 +826,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set SOFT_MODE). If the event is disabled, we + * nothing (but set soft_mode). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ @@ -821,7 +835,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = true; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } @@ -829,17 +843,17 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; - /* Keep the event disabled, when going to SOFT_MODE. */ + /* Keep the event disabled, when going to soft mode. 
*/ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); @@ -1382,7 +1396,7 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) if (!tr) return -ENOENT; - /* Modules events can be appened with :mod:<module> */ + /* Modules events can be appended with :mod:<module> */ mod = strstr(buf, ":mod:"); if (mod) { *mod = '\0'; @@ -1617,11 +1631,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l; iter = kzalloc(sizeof(*iter), GFP_KERNEL); + mutex_lock(&event_mutex); if (!iter) return NULL; - mutex_lock(&event_mutex); - iter->type = SET_EVENT_FILE; iter->file = list_entry(&tr->events, struct trace_event_file, list); @@ -1781,8 +1794,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & EVENT_FILE_FL_SOFT_DISABLED || - flags & EVENT_FILE_FL_SOFT_MODE) + if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); @@ -3125,7 +3137,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -3253,43 +3268,120 @@ static void add_str_to_module(struct module *module, char *str) list_add(&modstr->next, &module_strings); } +#define ATTRIBUTE_STR "__attribute__(" +#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) + +/* Remove all __attribute__() from @type. Return allocated string or @type. */ +static char *sanitize_field_type(const char *type) +{ + char *attr, *tmp, *next, *ret = (char *)type; + int depth; + + next = (char *)type; + while ((attr = strstr(next, ATTRIBUTE_STR))) { + /* Retry if "__attribute__(" is a part of another word. 
*/ + if (attr != next && !isspace(attr[-1])) { + next = attr + ATTRIBUTE_STR_LEN; + continue; + } + + if (ret == type) { + ret = kstrdup(type, GFP_KERNEL); + if (WARN_ON_ONCE(!ret)) + return NULL; + attr = ret + (attr - type); + } + + /* the ATTRIBUTE_STR already has the first '(' */ + depth = 1; + next = attr + ATTRIBUTE_STR_LEN; + do { + tmp = strpbrk(next, "()"); + /* There is unbalanced parentheses */ + if (WARN_ON_ONCE(!tmp)) { + kfree(ret); + return (char *)type; + } + + if (*tmp == '(') + depth++; + else + depth--; + next = tmp + 1; + } while (depth > 0); + next = skip_spaces(next); + strcpy(attr, next); + next = attr; + } + return ret; +} + +static char *find_replacable_eval(const char *type, const char *eval_string, + int len) +{ + char *ptr; + + if (!eval_string) + return NULL; + + ptr = strchr(type, '['); + if (!ptr) + return NULL; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + return NULL; + + if (strncmp(eval_string, ptr, len) != 0) + return NULL; + + return ptr; +} + static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; + const char *eval_string = NULL; struct list_head *head; + int len = 0; char *ptr; char *str; - int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ - if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; + if (map) { + eval_string = map->eval_string; + len = strlen(map->eval_string); + } + head = trace_get_fields(call); list_for_each_entry(field, head, link) { - ptr = strchr(field->type, '['); - if (!ptr) - continue; - ptr++; - - if (!isalpha(*ptr) && *ptr != '_') - continue; + str = sanitize_field_type(field->type); + if (!str) + return; - if (strncmp(map->eval_string, ptr, len) != 0) - continue; + ptr = find_replacable_eval(str, eval_string, len); + if (ptr) { + if (str == field->type) { + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + } - str = kstrdup(field->type, GFP_KERNEL); - if (WARN_ON_ONCE(!str)) - return; - ptr = str + (ptr - field->type); - ptr = eval_replace(ptr, map, len); - /* enum/sizeof string smaller than value */ - if (WARN_ON_ONCE(!ptr)) { - kfree(str); - continue; + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } } + if (str == field->type) + continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. 
Otherwise, it will stay allocated @@ -3299,14 +3391,18 @@ static void update_event_fields(struct trace_event_call *call, add_str_to_module(call->module, str); field->type = str; + if (field->filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(field->type); } } -void trace_event_eval_update(struct trace_eval_map **map, int len) +/* Update all events for replacing eval and sanitizing */ +void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; + bool updated; int last_i; int i; @@ -3319,6 +3415,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) last_system = call->class->system; } + updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the @@ -3338,8 +3435,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } update_event_printk(call, map[i]); update_event_fields(call, map[i]); + updated = true; } } + /* If not updated yet, update field for sanitizing. */ + if (!updated) + update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); @@ -3573,7 +3674,7 @@ static int probe_remove_event_call(struct trace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) @@ -3684,7 +3785,7 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } - /* Check for any strings allocade for this module */ + /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; @@ -3739,6 +3840,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) @@ -3986,7 +4089,7 @@ static int free_probe_data(void *data) edata->ref--; if (!edata->ref) { - /* Remove the SOFT_MODE flag */ + /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2048560264bb..385af8405392 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -142,7 +142,7 @@ static bool is_not(const char *str) } /** - * struct prog_entry - a singe entry in the filter program + * struct prog_entry - a single entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. 
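As a rough illustration of the approach taken by the sanitize_field_type() helper added in the trace_events.c hunk above — dropping every "__attribute__(...)" span from a field type string by tracking parenthesis depth — here is a self-contained user-space sketch; strip_attributes() and the sample input are invented for illustration and are not part of the patch:

/*
 * Sketch only: scan for "__attribute__(" and splice out everything up to
 * the balancing ')', the same parenthesis-depth idea as the kernel helper.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *strip_attributes(const char *type)
{
	static const char marker[] = "__attribute__(";
	const size_t mlen = sizeof(marker) - 1;
	char *buf = strdup(type);
	char *attr, *scan = buf;

	if (!buf)
		return NULL;

	while ((attr = strstr(scan, marker))) {
		char *p = attr + mlen;	/* marker already contains the first '(' */
		int depth = 1;

		/* ignore matches that are only the tail of a longer word */
		if (attr != buf && !isspace((unsigned char)attr[-1])) {
			scan = p;
			continue;
		}

		while (depth > 0 && *p) {	/* walk to the balancing ')' */
			if (*p == '(')
				depth++;
			else if (*p == ')')
				depth--;
			p++;
		}
		while (isspace((unsigned char)*p))
			p++;
		memmove(attr, p, strlen(p) + 1);	/* splice the span out */
		scan = attr;
	}
	return buf;
}

int main(void)
{
	char *s = strip_attributes("__attribute__((user)) char *uname");

	printf("%s\n", s);	/* prints: char *uname */
	free(s);
	return 0;
}

The kernel helper additionally warns and falls back to the original string when the parentheses turn out to be unbalanced; the sketch above simply stops at the end of the string in that case.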
@@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1335,22 +1337,149 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + union { + struct rcu_head rcu; + struct rcu_work rwork; + }; +}; + +static void free_filter_list(struct filter_head *filter_list) +{ + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_work(struct work_struct *work) +{ + struct filter_head *filter_list; + + filter_list = container_of(to_rcu_work(work), struct filter_head, rwork); + free_filter_list(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + + INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); + queue_rcu_work(system_wq, &filter_list->rwork); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. 
+ */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(head); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; __free_subsystem_filter(file); } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -2120,22 +2249,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,11 +2257,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2175,7 +2293,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + 
list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2313,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(filter_list); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(filter_list); + return -ENOMEM; } @@ -2361,9 +2470,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2494,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2417,9 +2521,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); + filter_free_subsystem_filters(dir, tr, filter); return 0; } @@ -2810,6 +2912,10 @@ static __init int ftrace_test_event_filter(void) if (i == DATA_CNT) printk(KERN_CONT "OK\n"); + /* Need to call ftrace_test_filter to prevent a warning */ + if (!trace_ftrace_test_filter_enabled()) + trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8); + return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1260c23cfa5f..5e6e70540eef 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -114,6 +114,7 @@ enum hist_field_fn { HIST_FIELD_FN_BUCKET, HIST_FIELD_FN_TIMESTAMP, HIST_FIELD_FN_CPU, + HIST_FIELD_FN_COMM, HIST_FIELD_FN_STRING, HIST_FIELD_FN_DYNSTRING, HIST_FIELD_FN_RELDYNSTRING, @@ -506,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, HIST_FIELD_FL_GRAPH = 1 << 20, + HIST_FIELD_FL_COMM = 1 << 21, }; struct var_defs { @@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +static u64 hist_field_comm(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return (u64)(unsigned long)current->comm; +} + /** * check_field_for_var_ref - Check if a VAR_REF field references a variable * @hist_field: The VAR_REF field to check @@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field 
*field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_COMM) + field_name = "common_comm"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2015,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_COMM) { + hist_field->fn_num = HIST_FIELD_FN_COMM; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = "char[]"; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2359,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; - } else if (strcmp(field_name, "common_cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) { *flags |= HIST_FIELD_FL_CPU; - else if (strcmp(field_name, "hitcount") == 0) + } else if (strcmp(field_name, "common_comm") == 0) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; + } else if (strcmp(field_name, "hitcount") == 0) *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); @@ -2377,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_CPU; } else if (field && field->filter_type == FILTER_STACKTRACE) { *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (field && field->filter_type == FILTER_COMM) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -3248,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; @@ -4327,6 +4353,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return hist_field_timestamp(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_CPU: return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COMM: + return hist_field_comm(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_STRING: return hist_field_string(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_DYNSTRING: @@ -5212,22 +5240,25 @@ static inline void add_to_key(char *compound_key, void *key, size_t size = key_field->size; if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_RDYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_STATIC_STRING) - size = field->size; + if (key_field->flags & HIST_FIELD_FL_COMM) { + size = strlen((char *)key); + } else { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + } /* ensure 
NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - - strncpy(compound_key + key_field->offset, (char *)key, size); - } else - memcpy(compound_key + key_field->offset, key, size); + } + memcpy(compound_key + key_field->offset, key, size); } static void @@ -5246,17 +5277,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a trigger happens at each of + * these contexts and interrupts a currently active trigger. + */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5264,12 +5372,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5293,26 +5407,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; 
+ key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } hist_poll_wakeup(); + + out: + put_hist_pad(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5577,7 +5696,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5899,7 +6018,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -6011,6 +6130,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + if (hist_field->flags & HIST_FIELD_FL_COMM) + seq_puts(m, "common_comm"); else if (hist_field->flags & HIST_FIELD_FL_CONST) seq_printf(m, "%llu", hist_field->constant); else if (field_name) { @@ -6157,6 +6278,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6201,24 +6325,24 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } -static const struct event_trigger_ops event_hist_trigger_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_init, - .free = event_hist_trigger_free, -}; - static int event_hist_trigger_named_init(struct event_trigger_data *data) { + int ret; + data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); + ret = event_hist_trigger_init(data->named_data); + if (ret < 0) { + kfree(data->cmd_ops); + data->cmd_ops = &trigger_hist_cmd; + } - return 0; + return ret; } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6230,24 +6354,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) data->ref--; if (!data->ref) { + struct event_command *cmd_ops = data->cmd_ops; + del_named_trigger(data); trigger_data_free(data); + kfree(cmd_ops); } } -static const struct event_trigger_ops event_hist_trigger_named_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_named_init, - .free = event_hist_trigger_named_free, -}; - -static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) -{ - return &event_hist_trigger_ops; -} - static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data 
= data->private_data; @@ -6441,13 +6555,24 @@ static int hist_register_trigger(char *glob, data->paused = true; if (named_data) { + struct event_command *cmd_ops; + data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); - data->ops = &event_hist_trigger_named_ops; + /* Copy the command ops and update some of the functions */ + cmd_ops = kmalloc(sizeof(*cmd_ops), GFP_KERNEL); + if (!cmd_ops) { + ret = -ENOMEM; + goto out; + } + *cmd_ops = *data->cmd_ops; + cmd_ops->init = event_hist_trigger_named_init; + cmd_ops->free = event_hist_trigger_named_free; + data->cmd_ops = cmd_ops; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) goto out; } @@ -6561,8 +6686,8 @@ static void hist_unregister_trigger(char *glob, } } - if (test && test->ops->free) - test->ops->free(test); + if (test && test->cmd_ops->free) + test->cmd_ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || test) @@ -6614,8 +6739,8 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6705,7 +6830,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; @@ -6791,8 +6916,11 @@ static struct event_command trigger_hist_cmd = { .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, - .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, }; __init int register_trigger_hist_cmd(void) @@ -6824,66 +6952,6 @@ hist_enable_trigger(struct event_trigger_data *data, } } -static void -hist_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - hist_enable_trigger(data, buffer, rec, event); -} - -static const struct event_trigger_ops hist_enable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_enable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_disable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_disable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops * -hist_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - - enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); - - if (enable) - ops = param ? 
&hist_enable_count_trigger_ops : - &hist_enable_trigger_ops; - else - ops = param ? &hist_disable_count_trigger_ops : - &hist_disable_trigger_ops; - - return ops; -} - static void hist_enable_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; @@ -6893,8 +6961,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6906,8 +6974,12 @@ static struct event_command trigger_hist_enable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_hist_disable_cmd = { @@ -6917,8 +6989,12 @@ static struct event_command trigger_hist_disable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_hist_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 33cfbd4ed76d..4554c458b78c 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -359,7 +359,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, fmt = synth_field_fmt(se->fields[i]->type); /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", fmt); snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); @@ -375,7 +375,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); @@ -536,12 +535,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. 
*/ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -584,8 +583,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b66b6d235d91..06b75bcfc7b8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -6,6 +6,7 @@ */ #include <linux/security.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -17,15 +18,77 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); +static struct task_struct *trigger_kthread; +static struct llist_head trigger_data_free_list; +static DEFINE_MUTEX(trigger_data_kthread_mutex); + +/* Bulk garbage collection of event_trigger_data elements */ +static int trigger_kthread_fn(void *ignore) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + /* Once this task starts, it lives forever */ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (llist_empty(&trigger_data_free_list)) + schedule(); + + __set_current_state(TASK_RUNNING); + + llnodes = llist_del_all(&trigger_data_free_list); + + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); + } + + return 0; +} + void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - /* make sure current triggers exit before free */ - tracepoint_synchronize_unregister(); + if (unlikely(!trigger_kthread)) { + guard(mutex)(&trigger_data_kthread_mutex); + /* Check again after taking mutex */ + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } + } + + if (!trigger_kthread) { + /* Do it the slow way */ + tracepoint_synchronize_unregister(); + kfree(data); + return; + } + + llist_add(&data->llist, &trigger_data_free_list); + wake_up_process(trigger_kthread); +} + +static inline void data_ops_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + const struct event_command *cmd_ops = data->cmd_ops; + + if (data->flags & EVENT_TRIGGER_FL_COUNT) { + if (!cmd_ops->count_func(data, buffer, rec, event)) + return; + } - kfree(data); + cmd_ops->trigger(data, buffer, rec, event); } /** @@ -70,7 +133,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -80,7 +143,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); } return tt; } @@ -122,7 +185,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if 
(data->cmd_ops->trigger_type & tt) - data->ops->trigger(data, NULL, NULL, NULL); + data_ops_trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -191,7 +254,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data); + data->cmd_ops->print(m, data); return 0; } @@ -245,7 +308,8 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) char *command, *next; struct event_command *p; - next = buff = skip_spaces(buff); + next = buff = strim(buff); + command = strsep(&next, ": \t"); if (next) { next = skip_spaces(next); @@ -282,8 +346,6 @@ static ssize_t event_trigger_regex_write(struct file *file, if (IS_ERR(buf)) return PTR_ERR(buf); - strim(buf); - guard(mutex)(&event_mutex); event_file = event_file_file(file); @@ -300,13 +362,9 @@ static ssize_t event_trigger_regex_write(struct file *file, static int event_trigger_regex_release(struct inode *inode, struct file *file) { - mutex_lock(&event_mutex); - if (file->f_mode & FMODE_READ) seq_release(inode, file); - mutex_unlock(&event_mutex); - return 0; } @@ -378,7 +436,37 @@ __init int unregister_event_command(struct event_command *cmd) } /** - * event_trigger_print - Generic event_trigger_ops @print implementation + * event_trigger_count - Optional count function for event triggers + * @data: Trigger-specific data + * @buffer: The ring buffer that the event is being written to + * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer + * + * For triggers that can take a count parameter that doesn't do anything + * special, they can use this function to assign to their .count_func + * field. + * + * This simply does a count down of the @data->count field. + * + * If the @data->count is greater than zero, it will decrement it. + * + * Returns false if @data->count is zero, otherwise true. + */ +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return false; + + if (data->count != -1) + (data->count)--; + + return true; +} + +/** + * event_trigger_print - Generic event_command @print implementation * @name: The name of the event trigger * @m: The seq_file being printed to * @data: Trigger-specific data @@ -413,7 +501,7 @@ event_trigger_print(const char *name, struct seq_file *m, } /** - * event_trigger_init - Generic event_trigger_ops @init implementation + * event_trigger_init - Generic event_command @init implementation * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -430,7 +518,7 @@ int event_trigger_init(struct event_trigger_data *data) } /** - * event_trigger_free - Generic event_trigger_ops @free implementation + * event_trigger_free - Generic event_command @free implementation * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. 
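The new event_trigger_count() helper and data_ops_trigger() shown above let one shared count function gate a trigger when EVENT_TRIGGER_FL_COUNT is set, instead of each command carrying a separate "_count" trigger_ops table. A minimal stand-alone sketch of that count-gated dispatch pattern follows; every name below is invented for illustration and is not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct demo_trigger {
	long count;					/* -1 means "no limit" */
	bool (*count_func)(struct demo_trigger *t);
	void (*trigger)(struct demo_trigger *t);
};

static bool demo_count(struct demo_trigger *t)
{
	if (!t->count)
		return false;			/* budget used up: suppress the trigger */
	if (t->count != -1)
		t->count--;
	return true;
}

static void demo_fire(struct demo_trigger *t)
{
	printf("fired, %ld invocation(s) left\n", t->count);
}

static void demo_dispatch(struct demo_trigger *t)
{
	/* optional count function runs first and may veto the callback */
	if (t->count_func && !t->count_func(t))
		return;
	t->trigger(t);
}

int main(void)
{
	struct demo_trigger t = {
		.count = 2,
		.count_func = demo_count,
		.trigger = demo_fire,
	};

	for (int i = 0; i < 4; i++)
		demo_dispatch(&t);		/* fires twice, then stays quiet */
	return 0;
}

Folding the countdown into one helper consulted by the dispatcher is what allows the patch, further down, to delete the near-identical traceon/traceoff, snapshot and hist-enable "count" trigger_ops variants.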
@@ -492,8 +580,8 @@ clear_event_triggers(struct trace_array *tr) list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); } } } @@ -552,16 +640,14 @@ static int register_trigger(char *glob, lockdep_assert_held(&event_mutex); list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { - ret = -EEXIST; - goto out; - } + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) + return -EEXIST; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -572,7 +658,6 @@ static int register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -598,8 +683,8 @@ static bool try_unregister_trigger(char *glob, } if (data) { - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); return true; } @@ -647,7 +732,7 @@ static void unregister_trigger(char *glob, * param - text following cmd and ':' and stripped of filter * filter - the optional filter text following (and including) 'if' * - * To illustrate the use of these componenents, here are some concrete + * To illustrate the use of these components, here are some concrete * examples. For the following triggers: * * echo 'traceon:5 if pid == 0' > trigger @@ -770,7 +855,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!param_and_filter) { if (param_required) ret = -EINVAL; - goto out; + return ret; } /* @@ -781,7 +866,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, */ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { *filter = param_and_filter; - goto out; + return ret; } /* @@ -799,44 +884,45 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!**filter) *filter = NULL; } -out: return ret; } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string * @private_data: User data to associate with the event trigger * * Allocate an event_trigger_data instance and initialize it. The - * @cmd_ops are used along with the @cmd and @param to get the - * trigger_ops to assign to the event_trigger_data. @private_data can - * also be passed in and associated with the event_trigger_data. + * @cmd_ops defines how the trigger will operate. If @param is set, + * and @cmd_ops->trigger_ops->count_func is non NULL, then the + * data->count is set to @param and before the trigger is executed, the + * @cmd_ops->trigger_ops->count_func() is called. If that function returns + * false, the @cmd_ops->trigger_ops->trigger() function will not be called. + * @private_data can also be passed in and associated with the + * event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. 
* * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; - const struct event_trigger_ops *trigger_ops; - - trigger_ops = cmd_ops->get_trigger_ops(cmd, param); trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); if (!trigger_data) return NULL; trigger_data->count = -1; - trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; trigger_data->private_data = private_data; + if (param && cmd_ops->count_func) + trigger_data->flags |= EVENT_TRIGGER_FL_COUNT; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -989,15 +1075,14 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) - goto out; + return ret; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); - ret = 0; - goto out; + trigger_data_free(trigger_data); + return 0; } ret = event_trigger_parse_num(param, trigger_data); @@ -1017,13 +1102,12 @@ event_trigger_parse(struct event_command *cmd_ops, /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_data); - out: return ret; out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); - goto out; + trigger_data_free(trigger_data); + return ret; } /** @@ -1057,10 +1141,10 @@ int set_trigger_filter(char *filter_str, s = strsep(&filter_str, " \t"); if (!strlen(s) || strcmp(s, "if") != 0) - goto out; + return ret; if (!filter_str) - goto out; + return ret; /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, @@ -1104,7 +1188,6 @@ int set_trigger_filter(char *filter_str, ret = -ENOMEM; } } - out: return ret; } @@ -1278,31 +1361,28 @@ traceon_trigger(struct event_trigger_data *data, tracing_on(); } -static void -traceon_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceon_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if (file) { if (tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (tracing_is_on()) - return; + return false; } if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_on(file->tr); - else - tracing_on(); + return true; } static void @@ -1326,31 +1406,28 @@ traceoff_trigger(struct event_trigger_data *data, tracing_off(); } -static void -traceoff_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceoff_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if (file) { if (!tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (!tracing_is_on()) - return; + return false; } if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - 
tracer_tracing_off(file->tr); - else - tracing_off(); + return true; } static int @@ -1367,58 +1444,18 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops traceon_trigger_ops = { - .trigger = traceon_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceon_count_trigger_ops = { - .trigger = traceon_count_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceoff_trigger_ops = { - .trigger = traceoff_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceoff_count_trigger_ops = { - .trigger = traceoff_count_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -onoff_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = param ? &traceon_count_trigger_ops : - &traceon_trigger_ops; - else - ops = param ? &traceoff_count_trigger_ops : - &traceoff_trigger_ops; - - return ops; -} - static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceon_trigger, + .count_func = traceon_count_func, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static struct event_command trigger_traceoff_cmd = { @@ -1428,8 +1465,12 @@ static struct event_command trigger_traceoff_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceoff_trigger, + .count_func = traceoff_count_func, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -1446,20 +1487,6 @@ snapshot_trigger(struct event_trigger_data *data, tracing_snapshot(); } -static void -snapshot_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - snapshot_trigger(data, buffer, rec, event); -} - static int register_snapshot_trigger(char *glob, struct event_trigger_data *data, @@ -1491,34 +1518,18 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops snapshot_trigger_ops = { - .trigger = snapshot_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops snapshot_count_trigger_ops = { - .trigger = snapshot_count_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -snapshot_get_trigger_ops(char *cmd, char *param) -{ - return param ? 
&snapshot_count_trigger_ops : &snapshot_trigger_ops; -} - static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_snapshot_trigger, - .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = snapshot_trigger, + .count_func = event_trigger_count, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_snapshot_cmd(void) @@ -1560,25 +1571,11 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } -static void -stacktrace_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - stacktrace_trigger(data, buffer, rec, event); -} - static int stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { @@ -1586,26 +1583,6 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops stacktrace_trigger_ops = { - .trigger = stacktrace_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops stacktrace_count_trigger_ops = { - .trigger = stacktrace_count_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -stacktrace_get_trigger_ops(char *cmd, char *param) -{ - return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; -} - static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, @@ -1613,8 +1590,12 @@ static struct event_command trigger_stacktrace_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = stacktrace_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = stacktrace_trigger, + .count_func = event_trigger_count, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_stacktrace_cmd(void) @@ -1649,24 +1630,24 @@ event_enable_trigger(struct event_trigger_data *data, set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } -static void -event_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +event_enable_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; if (!data->count) - return; + return false; /* Skip if the event is in a state we want to switch to */ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) - return; + return false; if (data->count != -1) (data->count)--; - event_enable_trigger(data, buffer, rec, event); + return true; } int event_enable_trigger_print(struct seq_file *m, @@ -1711,34 +1692,6 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static const struct event_trigger_ops event_enable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_enable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter) @@ -1772,7 +1725,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); if (!event_enable_file) - goto out; + return ret; #ifdef CONFIG_HIST_TRIGGERS hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || @@ -1787,16 +1740,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); if (!enable_data) - goto out; + return ret; enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); - goto out; + return ret; } if (remove) { @@ -1804,7 +1757,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, kfree(trigger_data); 
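[Illustrative note, not part of the diff] The event_enable count path below adds one wrinkle to the plain countdown: when the target event is already in the state the trigger would switch it to, the function returns false without consuming the count, so the budget is only spent on real state changes. A standalone userspace sketch of that "veto before decrement" ordering, with hypothetical demo_* names:

    #include <stdbool.h>
    #include <stdio.h>

    struct demo_enable_data {
            bool enable;            /* state the trigger wants to switch to */
            bool soft_disabled;     /* current state of the target event */
    };

    struct demo_trigger_data {
            long count;
            struct demo_enable_data *private_data;
    };

    /* Mirrors the ordering used by the enable/disable count function: an
     * exhausted count or "already in the requested state" both skip the
     * trigger, and only a real transition consumes one unit of the count. */
    static bool demo_enable_count(struct demo_trigger_data *data)
    {
            struct demo_enable_data *ed = data->private_data;

            if (!data->count)
                    return false;

            if (ed->enable == !ed->soft_disabled)
                    return false;           /* nothing to do, count untouched */

            if (data->count != -1)
                    data->count--;

            return true;
    }

    int main(void)
    {
            struct demo_enable_data ed = { .enable = true, .soft_disabled = false };
            struct demo_trigger_data data = { .count = 1, .private_data = &ed };

            printf("already enabled: %s (count=%ld)\n",
                   demo_enable_count(&data) ? "fire" : "skip", data.count);

            ed.soft_disabled = true;        /* now a real transition is needed */
            printf("needs enabling:  %s (count=%ld)\n",
                   demo_enable_count(&data) ? "fire" : "skip", data.count);

            return 0;
    }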
kfree(enable_data); ret = 0; - goto out; + return ret; } /* Up the trigger_data count to make sure nothing frees it on failure */ @@ -1834,7 +1787,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, goto out_disable; event_trigger_free(trigger_data); - out: return ret; out_disable: trace_event_enable_disable(event_enable_file, 0, 1); @@ -1845,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); kfree(enable_data); - goto out; + return ret; } int event_enable_register_trigger(char *glob, @@ -1865,15 +1817,14 @@ int event_enable_register_trigger(char *glob, (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { - ret = -EEXIST; - goto out; + return -EEXIST; } } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -1884,7 +1835,6 @@ int event_enable_register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -1912,30 +1862,8 @@ void event_enable_unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); -} - -static const struct event_trigger_ops * -event_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - -#ifdef CONFIG_HIST_TRIGGERS - enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || - (strcmp(cmd, ENABLE_HIST_STR) == 0)); -#else - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; -#endif - if (enable) - ops = param ? &event_enable_count_trigger_ops : - &event_enable_trigger_ops; - else - ops = param ? &event_disable_count_trigger_ops : - &event_disable_trigger_ops; - - return ops; + if (data && data->cmd_ops->free) + data->cmd_ops->free(data); } static struct event_command trigger_enable_cmd = { @@ -1944,8 +1872,12 @@ static struct event_command trigger_enable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_disable_cmd = { @@ -1954,8 +1886,12 @@ static struct event_command trigger_disable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172..dca6e50b3b21 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; @@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t) * so we use a work queue after 
call_rcu() to run within. */ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); - queue_rcu_work(system_wq, &mm->put_rwork); + queue_rcu_work(system_percpu_wq, &mm->put_rwork); } void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) @@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type) static int user_field_size(const char *type) { - /* long is not allowed from a user, since it's ambigious in size */ + /* long is not allowed from a user, since it's ambiguous in size */ if (strcmp(type, "s64") == 0) return sizeof(s64); if (strcmp(type, "u64") == 0) @@ -1079,7 +1079,7 @@ static int user_field_size(const char *type) if (str_has_prefix(type, "__rel_loc ")) return sizeof(u32); - /* Uknown basic type, error */ + /* Unknown basic type, error */ return -EINVAL; } @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); - - revert_creds(old_cred); - put_cred(cred); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) @@ -2475,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, /* * Prevent users from using the same address and bit multiple times * within the same mm address space. This can cause unexpected behavior - * for user processes that is far easier to debug if this is explictly + * for user processes that is far easier to debug if this is explicitly * an error upon registering. */ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b40fa59159ac..262c0556e4af 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,15 +4,18 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt -#include <asm/ptrace.h> #include <linux/fprobe.h> +#include <linux/list.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/rculist.h> #include <linux/security.h> #include <linux/tracepoint.h> #include <linux/uaccess.h> +#include <asm/ptrace.h> + #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_kernel.h" @@ -21,7 +24,6 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 -#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -38,15 +40,157 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +/* List of tracepoint_user */ +static LIST_HEAD(tracepoint_user_list); +static DEFINE_MUTEX(tracepoint_user_mutex); + +/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. 
*/ +struct tracepoint_user { + struct list_head list; + const char *name; + struct tracepoint *tpoint; + unsigned int refcount; +}; + +/* NOTE: you must lock tracepoint_user_mutex. */ +#define for_each_tracepoint_user(tuser) \ + list_for_each_entry(tuser, &tracepoint_user_list, list) + +static int tracepoint_user_register(struct tracepoint_user *tuser) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return 0; + + return tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); +} + +static void tracepoint_user_unregister(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return; + + WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL)); + tuser->tpoint = NULL; +} + +static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return 0UL; + + return (unsigned long)tuser->tpoint->probestub; +} + +static void __tracepoint_user_free(struct tracepoint_user *tuser) +{ + if (!tuser) + return; + kfree(tuser->name); + kfree(tuser); +} + +DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T)) + +static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint) +{ + struct tracepoint_user *tuser __free(tuser_free) = NULL; + int ret; + + tuser = kzalloc(sizeof(*tuser), GFP_KERNEL); + if (!tuser) + return NULL; + tuser->name = kstrdup(name, GFP_KERNEL); + if (!tuser->name) + return NULL; + + /* Register tracepoint if it is loaded. */ + if (tpoint) { + tuser->tpoint = tpoint; + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + + tuser->refcount = 1; + INIT_LIST_HEAD(&tuser->list); + list_add(&tuser->list, &tracepoint_user_list); + + return_ptr(tuser); +} + +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); + +/* + * Get tracepoint_user if exist, or allocate new one and register it. + * If tracepoint is on a module, get its refcounter too. + * This returns errno or NULL (not loaded yet) or tracepoint_user. + */ +static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod) +{ + struct module *mod __free(module_put) = NULL; + struct tracepoint_user *tuser; + struct tracepoint *tpoint; + + if (!name || !pmod) + return ERR_PTR(-EINVAL); + + /* Get and lock the module which has tracepoint. */ + tpoint = find_tracepoint(name, &mod); + + guard(mutex)(&tracepoint_user_mutex); + /* Search existing tracepoint_user */ + for_each_tracepoint_user(tuser) { + if (!strcmp(tuser->name, name)) { + tuser->refcount++; + *pmod = no_free_ptr(mod); + return tuser; + } + } + + /* The corresponding tracepoint_user is not found. */ + tuser = __tracepoint_user_init(name, tpoint); + if (!IS_ERR_OR_NULL(tuser)) + *pmod = no_free_ptr(mod); + + return tuser; +} + +static void tracepoint_user_put(struct tracepoint_user *tuser) +{ + scoped_guard(mutex, &tracepoint_user_mutex) { + if (--tuser->refcount > 0) + return; + + list_del(&tuser->list); + tracepoint_user_unregister(tuser); + } + + __tracepoint_user_free(tuser); +} + +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) + /* * Fprobe event core functions */ + +/* + * @tprobe is true for tracepoint probe. + * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not + * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled. 
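[Illustrative note, not part of the diff] The tracepoint_user list introduced above is a name-keyed, refcounted registry protected by a mutex: a lookup either takes a reference on an existing entry or allocates, registers, and links a new one, and the final put unlinks, unregisters, and frees it. A minimal userspace sketch of that find-or-create pattern follows (pthread mutex instead of the kernel mutex/guard helpers, registration elided; all demo_* names are hypothetical).

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct demo_user {
            struct demo_user *next;
            char *name;
            unsigned int refcount;
    };

    static struct demo_user *demo_list;
    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Find-or-create: an existing entry just gains a reference, otherwise a
     * new one is allocated and put on the list with refcount 1. */
    static struct demo_user *demo_find_get(const char *name)
    {
            struct demo_user *u;

            pthread_mutex_lock(&demo_lock);
            for (u = demo_list; u; u = u->next) {
                    if (!strcmp(u->name, name)) {
                            u->refcount++;
                            pthread_mutex_unlock(&demo_lock);
                            return u;
                    }
            }

            u = calloc(1, sizeof(*u));
            if (u && !(u->name = strdup(name))) {
                    free(u);
                    u = NULL;
            }
            if (u) {
                    u->refcount = 1;
                    u->next = demo_list;
                    demo_list = u;
            }
            pthread_mutex_unlock(&demo_lock);
            return u;
    }

    /* Drop a reference; the last put unlinks and frees the entry. */
    static void demo_put(struct demo_user *u)
    {
            struct demo_user **p;

            pthread_mutex_lock(&demo_lock);
            if (--u->refcount) {
                    pthread_mutex_unlock(&demo_lock);
                    return;
            }
            for (p = &demo_list; *p; p = &(*p)->next) {
                    if (*p == u) {
                            *p = u->next;
                            break;
                    }
            }
            pthread_mutex_unlock(&demo_lock);
            free(u->name);
            free(u);
    }

    int main(void)
    {
            struct demo_user *a = demo_find_get("sched_switch");
            struct demo_user *b = demo_find_get("sched_switch");

            printf("same entry: %s, refcount=%u\n", a == b ? "yes" : "no", a->refcount);
            demo_put(b);
            demo_put(a);
            return 0;
    }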
+ */ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; - struct tracepoint *tpoint; - struct module *mod; + bool tprobe; + struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -76,7 +220,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tpoint != NULL; + return tf->tprobe; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -379,13 +523,14 @@ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); int ret = 0; - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) fentry_trace_func(tf, entry_ip, fregs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; @@ -397,11 +542,12 @@ static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } @@ -411,6 +557,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf) { if (tf) { trace_probe_cleanup(&tf->tp); + if (tf->tuser) + tracepoint_user_put(tf->tuser); kfree(tf->symbol); kfree(tf); } @@ -425,9 +573,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint *tpoint, - struct module *mod, - int nargs, bool is_return) + int nargs, bool is_return, + bool is_tracepoint) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; @@ -445,8 +592,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; - tf->tpoint = tpoint; - tf->mod = mod; + tf->tprobe = is_tracepoint; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -469,98 +615,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event, return NULL; } -static inline int __enable_trace_fprobe(struct trace_fprobe *tf) -{ - if (trace_fprobe_is_registered(tf)) - enable_fprobe(&tf->fp); - - return 0; -} - -static void __disable_trace_fprobe(struct trace_probe *tp) -{ - struct trace_fprobe *tf; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - if (!trace_fprobe_is_registered(tf)) - continue; - disable_fprobe(&tf->fp); - } -} - -/* - * Enable trace_probe - * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
- */ -static int enable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - struct trace_fprobe *tf; - bool enabled; - int ret = 0; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - enabled = trace_probe_is_enabled(tp); - - /* This also changes "enabled" state */ - if (file) { - ret = trace_probe_add_file(tp, file); - if (ret) - return ret; - } else - trace_probe_set_flag(tp, TP_FLAG_PROFILE); - - if (!enabled) { - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - /* TODO: check the fprobe is gone */ - __enable_trace_fprobe(tf); - } - } - - return 0; -} - -/* - * Disable trace_probe - * if the file is NULL, disable "perf" handler, or disable "trace" handler. - */ -static int disable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - - if (file) { - if (!trace_probe_get_file_link(tp, file)) - return -ENOENT; - if (!trace_probe_has_single_file(tp)) - goto out; - trace_probe_clear_flag(tp, TP_FLAG_TRACE); - } else - trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - - if (!trace_probe_is_enabled(tp)) - __disable_trace_fprobe(tp); - - out: - if (file) - /* - * Synchronization is done in below function. For perf event, - * file == NULL and perf_trace_event_unreg() calls - * tracepoint_synchronize_unregister() to ensure synchronize - * event. We don't need to care about it. - */ - trace_probe_remove_file(tp, file); - - return 0; -} - /* Event entry printers */ static enum print_line_t print_fentry_event(struct trace_iterator *iter, int flags, @@ -578,7 +632,7 @@ print_fentry_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -608,12 +662,12 @@ print_fexit_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); @@ -712,20 +766,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; + unsigned long ip; int ret; + if (WARN_ON_ONCE(tf->tuser)) + return -EINVAL; + + /* If the tracepoint is in a module, it must be locked in this function. */ + tuser = tracepoint_user_find_get(tf->symbol, &mod); + /* This tracepoint is not loaded yet */ + if (IS_ERR(tuser)) + return PTR_ERR(tuser); + if (!tuser) + return -ENOMEM; + + /* Register fprobe only if the tracepoint is loaded. 
*/ + if (tuser->tpoint) { + ip = tracepoint_user_ip(tuser); + if (WARN_ON_ONCE(!ip)) + return -ENOENT; + + ret = register_fprobe_ips(&tf->fp, &ip, 1); + if (ret < 0) + return ret; + } + + tf->tuser = no_free_ptr(tuser); + return 0; +} + +/* Returns an error if the target function is not available, or 0 */ +static int trace_fprobe_verify_target(struct trace_fprobe *tf) +{ + int ret; + + /* Tracepoint should have a stub function. */ + if (trace_fprobe_is_tracepoint(tf)) + return 0; + /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. + * Note: since we don't lock the module, even if this succeeded, + * register_fprobe() later can fail. */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + ret = fprobe_count_ips_from_filter(tf->symbol, NULL); + return (ret < 0) ? ret : 0; } /* Internal register function - just handle fprobe and flags */ @@ -747,20 +833,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(&tf->tp)) - tf->fp.flags &= ~FPROBE_FL_DISABLED; - else - tf->fp.flags |= FPROBE_FL_DISABLED; - - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (tf->tpoint == TRACEPOINT_STUB) - return 0; + tf->fp.flags &= ~FPROBE_FL_DISABLED; + if (trace_fprobe_is_tracepoint(tf)) return __regsiter_tracepoint_fprobe(tf); - } /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); @@ -769,15 +845,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) /* Internal unregister function - just handle fprobe and flags */ static void __unregister_trace_fprobe(struct trace_fprobe *tf) { - if (trace_fprobe_is_registered(tf)) { + if (trace_fprobe_is_registered(tf)) unregister_fprobe(&tf->fp); - memset(&tf->fp, 0, sizeof(tf->fp)); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; - } + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } @@ -837,7 +909,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, return false; } -static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; @@ -865,7 +937,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) if (ret) return ret; - ret = __register_trace_fprobe(tf); + ret = trace_fprobe_verify_target(tf); if (ret) trace_probe_unlink(&tf->tp); else @@ -874,8 +946,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) return ret; } -/* Register a trace_probe and probe_event */ -static int register_trace_fprobe(struct trace_fprobe *tf) +/* Register a trace_probe and probe_event, and check the fprobe is available. 
*/ +static int register_trace_fprobe_event(struct trace_fprobe *tf) { struct trace_fprobe *old_tf; int ret; @@ -885,7 +957,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); if (old_tf) - return append_trace_fprobe(tf, old_tf); + return append_trace_fprobe_event(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -898,8 +970,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Register fprobe */ - ret = __register_trace_fprobe(tf); + /* Verify fprobe is sane. */ + ret = trace_fprobe_verify_target(tf); if (ret < 0) unregister_fprobe_event(tf); else @@ -963,15 +1035,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name, } #ifdef CONFIG_MODULES -static void reenable_trace_fprobe(struct trace_fprobe *tf) -{ - struct trace_probe *tp = &tf->tp; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - __enable_trace_fprobe(tf); - } -} - /* * Find a tracepoint from specified module. In this case, this does not get the * module's refcount. The caller must ensure the module is not freed. @@ -988,36 +1051,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod, return data.tpoint; } +/* These are CONFIG_MODULES=y specific functions. */ +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static int tracepoint_user_register_again(struct tracepoint_user *tuser, + struct tracepoint *tpoint) +{ + tuser->tpoint = tpoint; + return tracepoint_user_register(tuser); +} + +static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser) +{ + tracepoint_user_unregister(tuser); + tuser->tpoint = NULL; +} + +/* module callback for tracepoint_user */ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint_user *tuser; struct tracepoint *tpoint; + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + mutex_lock(&tracepoint_user_mutex); + for_each_tracepoint_user(tuser) { + if (val == MODULE_STATE_COMING) { + /* This is not a tracepoint in this module. Skip it. */ + tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name); + if (!tpoint) + continue; + WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint)); + } else if (val == MODULE_STATE_GOING && + tracepoint_user_within_module(tuser, tp_mod->mod)) { + /* Unregister all tracepoint_user in this module. 
*/ + tracepoint_user_unregister_clear(tuser); + } + } + mutex_unlock(&tracepoint_user_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; + +/* module callback for tprobe events */ +static int __tprobe_event_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ struct trace_fprobe *tf; struct dyn_event *pos; + struct module *mod = data; if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { - tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); - if (tpoint) { - tf->tpoint = tpoint; - tf->mod = tp_mod->mod; - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); - } - } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { + /* Skip fprobe and disabled tprobe events. */ + if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser) + continue; + + /* Before this notification, tracepoint notifier has already done. */ + if (val == MODULE_STATE_COMING && + tracepoint_user_within_module(tf->tuser, mod)) { + unsigned long ip = tracepoint_user_ip(tf->tuser); + + WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1)); + } else if (val == MODULE_STATE_GOING && + /* + * tracepoint_user_within_module() does not work here because + * tracepoint_user is already unregistered and cleared tpoint. + * Instead, checking whether the fprobe is registered but + * tpoint is cleared(unregistered). Such unbalance probes + * must be adjusted anyway. + */ + trace_fprobe_is_registered(tf) && + !tf->tuser->tpoint) { unregister_fprobe(&tf->fp); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = TRACEPOINT_STUB; - tf->mod = NULL; - } } } mutex_unlock(&event_mutex); @@ -1025,8 +1146,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, return NOTIFY_DONE; } -static struct notifier_block tracepoint_module_nb = { - .notifier_call = __tracepoint_probe_module_cb, +/* NOTE: this must be called after tracepoint callback */ +static struct notifier_block tprobe_event_module_nb = { + .notifier_call = __tprobe_event_module_cb, + /* Make sure this is later than tracepoint module notifier. */ + .priority = -10, }; #endif /* CONFIG_MODULES */ @@ -1086,8 +1210,6 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) - static int trace_fprobe_create_internal(int argc, const char *argv[], struct traceprobe_parse_context *ctx) { @@ -1116,19 +1238,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; - int i, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + struct module *mod __free(module_put) = NULL; const char **new_argv __free(kfree) = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char sbuf[KSYM_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *sbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; char *dbuf __free(kfree) = NULL; + int i, new_argc = 0, ret = 0; bool is_tracepoint = false; - struct module *tp_mod __free(module_put) = NULL; - struct tracepoint *tpoint = NULL; + bool is_return = false; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1156,6 +1277,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -1163,15 +1287,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], } if (!event) { + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; /* Make a new event name */ if (is_tracepoint) - snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s", isdigit(*symbol) ? "_" : "", symbol); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? "exit" : "entry"); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } if (is_return) @@ -1179,25 +1306,28 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], else ctx->flags |= TPARG_FL_FENTRY; + ctx->funcname = NULL; if (is_tracepoint) { + /* Get tracepoint and lock its module until the end of the registration. 
*/ + struct tracepoint *tpoint; + ctx->flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol, &tp_mod); + mod = NULL; + tpoint = find_tracepoint(symbol, &mod); if (tpoint) { - ctx->funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); - } else if (IS_ENABLED(CONFIG_MODULES)) { - /* This *may* be loaded afterwards */ - tpoint = TRACEPOINT_STUB; - ctx->funcname = symbol; - } else { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - return -EINVAL; + sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); } - } else + } + if (!ctx->funcname) ctx->funcname = symbol; + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, abuf, MAX_BTF_ARGS_LEN, ctx); @@ -1218,8 +1348,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1251,7 +1380,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (ret < 0) return ret; - ret = register_trace_fprobe(tf); + ret = register_trace_fprobe_event(tf); if (ret) { trace_probe_log_set_index(1); if (ret == -EILSEQ) @@ -1271,14 +1400,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], static int trace_fprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + trace_probe_log_init("trace_fprobe", argc, argv); - ret = trace_fprobe_create_internal(argc, argv, &ctx); - traceprobe_finish_parse(&ctx); + ret = trace_fprobe_create_internal(argc, argv, ctx); trace_probe_log_clear(); return ret; } @@ -1321,6 +1453,88 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) } /* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + ret = __register_trace_fprobe(tf); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
+ */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_fprobe *tf; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. */ static int fprobe_register(struct trace_event_call *event, @@ -1365,6 +1579,9 @@ static __init int init_fprobe_trace_early(void) ret = register_tracepoint_module_notifier(&tracepoint_module_nb); if (ret) return ret; + ret = register_module_notifier(&tprobe_event_module_nb); + if (ret) + return ret; #endif return 0; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 98ccf3f00c51..c12795c2fb39 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,11 +154,11 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - func = select_trace_function(func_flags.val); + func = select_trace_function(tr->current_trace_flags->val); if (!func) return -EINVAL; - if (!handle_func_repeats(tr, func_flags.val)) + if (!handle_func_repeats(tr, tr->current_trace_flags->val)) return -ENOMEM; ftrace_init_array_ops(tr, func); @@ -209,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -224,9 +223,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_dec(); - data = this_cpu_ptr(tr->array_buffer.data); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, NULL); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); ftrace_test_recursion_unlock(bit); } @@ -236,10 +233,8 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -250,10 +245,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, fregs); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -299,7 +291,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = 
local_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); @@ -311,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, __trace_stack(tr, trace_ctx, skip); } - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -352,7 +344,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_func_repeats *last_info; struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -364,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, return; parent_ip = function_get_true_parent_ip(parent_ip, fregs); - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on(tr)) goto out; /* @@ -412,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { last_info = per_cpu_ptr(tr->last_func_repeats, cpu); @@ -427,7 +417,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, } out: - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -469,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) u32 new_flags; /* Do nothing if already set. */ - if (!!set == !!(func_flags.val & bit)) + if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; /* We can change this flag only when not running. */ if (tr->current_trace != &function_trace) return 0; - new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + new_flags = (tr->current_trace_flags->val & ~bit) | (set ? 
bit : 0); func = select_trace_function(new_flags); if (!func) return -EINVAL; @@ -501,7 +491,7 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .flags = &func_flags, + .default_flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST @@ -633,11 +623,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0c357a89c58e..b1e9c9913309 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,9 +16,12 @@ #include "trace.h" #include "trace_output.h" -/* When set, irq functions will be ignored */ +/* When set, irq functions might be ignored */ static int ftrace_graph_skip_irqs; +/* Do not record function time when task is sleeping */ +int fgraph_no_sleep_time; + struct fgraph_cpu_data { pid_t last_pid; int depth; @@ -27,14 +30,26 @@ struct fgraph_cpu_data { unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; +struct fgraph_ent_args { + struct ftrace_graph_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + +struct fgraph_retaddr_ent_args { + struct fgraph_retaddr_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. 
*/ union { - struct ftrace_graph_ent_entry ent; - struct fgraph_retaddr_ent_entry rent; - } ent; + struct fgraph_ent_args ent; + struct fgraph_retaddr_ent_args rent; + }; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -78,11 +93,6 @@ static struct tracer_opt trace_opts[] = { /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, -#ifdef CONFIG_FUNCTION_PROFILER - /* Include time within nested functions */ - { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, -#endif - { } /* Empty entry */ }; @@ -90,13 +100,13 @@ static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | - TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, + TRACE_GRAPH_SLEEP_TIME, .opts = trace_opts }; -static bool tracer_flags_is_set(u32 flags) +static bool tracer_flags_is_set(struct trace_array *tr, u32 flags) { - return (tracer_flags.val & flags) == flags; + return (tr->current_trace_flags->val & flags) == flags; } /* @@ -155,20 +165,32 @@ int __trace_graph_entry(struct trace_array *tr, int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct fgraph_retaddr_ent_entry *entry; + int size; + + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT, - sizeof(*entry), trace_ctx); + size, trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); - entry->graph_ent.func = trace->func; - entry->graph_ent.depth = trace->depth; - entry->graph_ent.retaddr = retaddr; + entry->graph_rent.ent = *trace; + entry->graph_rent.retaddr = retaddr; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; @@ -177,17 +199,21 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { return 1; } #endif -static inline int ftrace_graph_ignore_irqs(void) +static inline int ftrace_graph_ignore_irqs(struct trace_array *tr) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; + if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + return 0; + return in_hardirq(); } @@ -202,12 +228,9 @@ static int graph_entry(struct ftrace_graph_ent *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; - long disabled; int ret = 0; - int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) return 0; @@ -228,22 +251,20 @@ static int graph_entry(struct ftrace_graph_ent *trace, return 1; } - if (!ftrace_trace_task(tr)) - return 0; - if (ftrace_graph_ignore_func(gops, trace)) return 0; - if (ftrace_graph_ignore_irqs()) + if (ftrace_graph_ignore_irqs(tr)) return 0; - if (fgraph_sleep_time) { - /* Only need to record the calltime */ - ftimes 
= fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); - } else { + if (fgraph_no_sleep_time && + !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) { ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); if (ftimes) ftimes->sleeptime = current->ftrace_sleeptime; + } else { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); } if (!ftimes) return 0; @@ -257,21 +278,15 @@ static int graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { - unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else { - ret = __graph_entry(tr, trace, trace_ctx, fregs); - } + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, + retaddr, fregs); + } else { + ret = __graph_entry(tr, trace, trace_ctx, fregs); } - preempt_enable_notrace(); return ret; } @@ -336,11 +351,15 @@ void __trace_graph_return(struct trace_array *tr, trace_buffer_unlock_commit_nostack(buffer, event); } -static void handle_nosleeptime(struct ftrace_graph_ret *trace, +static void handle_nosleeptime(struct trace_array *tr, + struct ftrace_graph_ret *trace, struct fgraph_times *ftimes, int size) { - if (fgraph_sleep_time || size < sizeof(*ftimes)) + if (size < sizeof(*ftimes)) + return; + + if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) return; ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; @@ -351,13 +370,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; u64 calltime, rettime; - long disabled; int size; - int cpu; rettime = trace_clock_local(); @@ -372,19 +388,12 @@ void trace_graph_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + handle_nosleeptime(tr, trace, ftimes, size); calltime = ftimes->calltime; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); - } - preempt_enable_notrace(); + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, @@ -392,6 +401,7 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct ftrace_regs *fregs) { struct fgraph_times *ftimes; + struct trace_array *tr; int size; ftrace_graph_addr_finish(gops, trace); @@ -405,7 +415,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + tr = gops->private; + handle_nosleeptime(tr, trace, ftimes, size); if (tracing_thresh && (trace_clock_local() - ftimes->calltime < 
tracing_thresh)) @@ -454,7 +465,7 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS)) tr->gops->entryfunc = trace_graph_entry_args; else tr->gops->entryfunc = trace_graph_entry; @@ -464,6 +475,12 @@ static int graph_trace_init(struct trace_array *tr) else tr->gops->retfunc = trace_graph_return; + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs++; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time++; + /* Make gops functions visible before we start tracing */ smp_mb(); @@ -475,6 +492,8 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; @@ -499,6 +518,16 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set) static void graph_trace_reset(struct trace_array *tr) { + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs--; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + tracing_stop_cmdline_record(); unregister_ftrace_graph(tr->gops); } @@ -527,7 +556,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; + char pid_str[12]; int spaces = 0; int len; int i; @@ -641,10 +670,9 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) - data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; - else - data->ent.ent = *curr; + int size = min_t(int, sizeof(data->rent), iter->ent_size); + + memcpy(&data->rent, curr, size); /* * If the next event is not a return type, then * we only care about what type it is. 
Otherwise we can @@ -707,7 +735,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -727,7 +755,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); } @@ -781,7 +809,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* No real adata, just filling the column with spaces */ @@ -822,7 +850,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e trace_seq_puts(s, " /*"); trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags); if (comment) trace_seq_puts(s, " */"); @@ -968,7 +996,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps", (void *)ret_func); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { - print_function_args(s, entry->args, ret_func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func); trace_seq_putc(s, ';'); } else trace_seq_puts(s, "();"); @@ -1020,7 +1048,7 @@ print_graph_entry_nested(struct trace_iterator *iter, args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) - print_function_args(s, entry->args, func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func); else trace_seq_puts(s, "()"); @@ -1058,7 +1086,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* Absolute time */ @@ -1080,7 +1108,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); return; @@ -1202,11 +1230,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* * print_graph_entry() may consume the current event, * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. + * This function is shared by ftrace_graph_ent_entry and + * fgraph_retaddr_ent_entry, the size of the latter one + * is larger, but it is very small and can be safely saved + * at the stack. 
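A note on how the printers above detect recorded arguments: the decision is made purely from the event size, since an entry appears to carry either all FTRACE_REGS_MAX_ARGS words or none. A minimal standalone sketch of that size test, where struct ent and MAX_ARGS are simplified stand-ins for ftrace_graph_ent_entry and FTRACE_REGS_MAX_ARGS:

#include <stddef.h>
#include <stdio.h>

#define MAX_ARGS 6				/* stand-in for FTRACE_REGS_MAX_ARGS */

struct ent {					/* simplified ftrace_graph_ent_entry */
	unsigned long func;
	int depth;
	unsigned long args[];			/* present only when args were recorded */
};

/* Mirrors the args_size check above: arguments are present only when the
 * event extends MAX_ARGS longs past the fixed part of the structure. */
static int ent_has_args(size_t ent_size)
{
	return ent_size >= offsetof(struct ent, args) + MAX_ARGS * sizeof(long);
}

int main(void)
{
	printf("%d\n", ent_has_args(sizeof(struct ent)));	/* 0: bare entry */
	printf("%d\n", ent_has_args(offsetof(struct ent, args) +
				    MAX_ARGS * sizeof(long)));	/* 1: entry + args */
	return 0;
}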
*/ struct ftrace_graph_ent_entry *entry; - u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + struct fgraph_retaddr_ent_entry *rentry; + u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; /* The ent_size is expected to be as big as the entry */ if (iter->ent_size > sizeof(save_buf)) @@ -1435,12 +1466,17 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { - struct fgraph_retaddr_ent_entry saved; + /* + * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have + * similar functions and memory layouts. The only difference + * is that the latter one has an extra retaddr member, so + * they can share most of the logic. + */ struct fgraph_retaddr_ent_entry *rfield; trace_assign_type(rfield, entry); - saved = *rfield; - return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags); + return print_graph_entry((struct ftrace_graph_ent_entry *)rfield, + s, iter, flags); } #endif case TRACE_GRAPH_RET: { @@ -1463,7 +1499,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + struct trace_array *tr = iter->tr; + return print_graph_function_flags(iter, tr->current_trace_flags->val); } static enum print_line_t @@ -1499,7 +1536,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) static void __print_graph_headers_flags(struct trace_array *tr, struct seq_file *s, u32 flags) { - int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT); if (lat) print_lat_header(s, flags); @@ -1539,7 +1576,10 @@ static void __print_graph_headers_flags(struct trace_array *tr, static void print_graph_headers(struct seq_file *s) { - print_graph_headers_flags(s, tracer_flags.val); + struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; + + print_graph_headers_flags(s, tr->current_trace_flags->val); } void print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1547,10 +1587,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1617,17 +1657,56 @@ void graph_trace_close(struct trace_iterator *iter) static int func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - if (bit == TRACE_GRAPH_PRINT_IRQS) - ftrace_graph_skip_irqs = !set; +/* + * The function profiler gets updated even if function graph + * isn't the current tracer. Handle it separately. 
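The init/reset and set_flag paths above treat ftrace_graph_skip_irqs and fgraph_no_sleep_time as counters rather than booleans, so several trace instances can hold different per-instance option values at once; the WARN_ON_ONCE() clamps only catch unbalanced updates. A rough sketch of that discipline, with a hypothetical feature_disable_count standing in for either counter:

static int feature_disable_count;	/* stand-in for fgraph_no_sleep_time etc. */

/* one instance turned the option off */
static void instance_clears_option(void)
{
	feature_disable_count++;
}

/* one instance turned the option back on */
static void instance_sets_option(void)
{
	feature_disable_count--;
	if (feature_disable_count < 0)		/* unbalanced set/clear */
		feature_disable_count = 0;
}

/* the fast path honours the option only while nobody has disabled it */
static int option_globally_active(void)
{
	return feature_disable_count == 0;
}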
+ */ +#ifdef CONFIG_FUNCTION_PROFILER + if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) && + !!set == fprofile_no_sleep_time) { + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + fprofile_no_sleep_time = false; + } else { + fgraph_no_sleep_time++; + fprofile_no_sleep_time = true; + } + } +#endif - if (bit == TRACE_GRAPH_SLEEP_TIME) - ftrace_graph_sleep_time_control(set); + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; - if (bit == TRACE_GRAPH_GRAPH_TIME) - ftrace_graph_graph_time_control(set); + /* Do nothing if already set. */ + if (!!set == !!(tr->current_trace_flags->val & bit)) + return 0; - if (bit == TRACE_GRAPH_ARGS) + switch (bit) { + case TRACE_GRAPH_SLEEP_TIME: + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + } else { + fgraph_no_sleep_time++; + } + break; + + case TRACE_GRAPH_PRINT_IRQS: + if (set) + ftrace_graph_skip_irqs--; + else + ftrace_graph_skip_irqs++; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + break; + + case TRACE_GRAPH_ARGS: return ftrace_graph_trace_args(tr, set); + } return 0; } @@ -1664,7 +1743,7 @@ static struct tracer graph_trace __tracer_data = { .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, - .flags = &tracer_flags, + .default_flags = &tracer_flags, .set_flag = func_graph_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b65353ec2837..2f7b94e98317 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,12 +325,9 @@ static void move_to_next_cpu(void) cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask); cpus_read_unlock(); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(current_mask); - if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! 
*/ goto change_mode; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 40c39e946940..17673905907c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { @@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (likely(disabled == 1)) return 1; - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); return 0; } @@ -152,7 +152,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } #endif /* CONFIG_FUNCTION_TRACER */ @@ -184,7 +184,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, unsigned long flags; unsigned int trace_ctx; u64 *calltime; - int ret; + int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) return 0; @@ -202,14 +202,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, return 0; calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); - if (!calltime) - return 0; - - *calltime = trace_clock_local(); - - trace_ctx = tracing_gen_ctx_flags(flags); - ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + if (calltime) { + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } + local_dec(&data->disabled); return ret; } @@ -233,12 +231,11 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); - if (!calltime) - return; - - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + if (calltime) { + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); + } + local_dec(&data->disabled); } static struct fgraph_ops fgraph_ops = { @@ -397,6 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; + long disabled; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -408,20 +406,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (unlikely(!data) || atomic_read(&data->disabled)) + if (unlikely(!data) || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; + if (disabled == 1) { + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? 
: ip; - __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); - per_cpu(tracing_cpu, cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; + } - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static nokprobe_inline void @@ -431,6 +431,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + long disabled; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -445,16 +446,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) + !data->critical_start || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - trace_ctx = tracing_gen_ctx(); - __trace_function(tr, ip, parent_ip, trace_ctx); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); + if (disabled == 1) { + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + } + + local_dec(&data->disabled); } /* start and stop critical timings used to for stoppage (in idle) */ @@ -481,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -511,7 +515,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -532,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set } #endif /* CONFIG_FUNCTION_TRACER */ -static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -540,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return irqsoff_display_graph(tr, set); #endif @@ -578,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); /* without pause, we will produce garbage if another latency occurs */ - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -601,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr) static void __irqsoff_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - int pause_flag = 
save_flags & TRACE_ITER_PAUSE_ON_TRACE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); + int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE); stop_irqsoff_tracer(tr, is_graph(tr)); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 1e72d20b3c2f..b30795f34079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); kdb_printf("Dumping ftrace buffer:\n"); if (skip_entries) @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -98,7 +96,6 @@ static int kdb_ftdump(int argc, const char **argv) long cpu_file; int err; int cnt; - int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -120,9 +117,7 @@ static int kdb_ftdump(int argc, const char **argv) trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_disable(iter.tr); /* A negative skip_entries means skip all but the last entries */ if (skip_entries < 0) { @@ -135,9 +130,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(iter.tr); kdb_trap_printk--; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2703b96d8990..9953506370a5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -9,19 +9,19 @@ #include <linux/bpf-cgroup.h> #include <linux/cleanup.h> -#include <linux/security.h> +#include <linux/error-injection.h> #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/rculist.h> -#include <linux/error-injection.h> +#include <linux/security.h> +#include <linux/uaccess.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. 
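The trace_kprobe hunk that follows replaces fixed on-stack name buffers with heap allocations annotated __free(kfree), so each of the function's many early returns frees them automatically. A minimal sketch of that scope-based cleanup pattern from <linux/cleanup.h>; the helper name and its check are made up for illustration:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical helper: buf is released on every return path below without
 * an explicit kfree(), which is what lets large buffers move off the stack
 * without adding cleanup bookkeeping. */
static int copy_and_check(const char *src, size_t len)
{
	char *buf __free(kfree) = kmalloc(len + 1, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	memcpy(buf, src, len);
	buf[len] = '\0';

	if (buf[0] == '-')
		return -EINVAL;		/* kfree(buf) runs here ... */

	return 0;			/* ... and here */
}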
*/ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + const char **new_argv __free(kfree) = NULL; int i, len, new_argc = 0, ret = 0; - bool is_return = false; char *symbol __free(kfree) = NULL; - char *tmp = NULL; - const char **new_argv __free(kfree) = NULL; - const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; enum probe_print_type ptype; + bool is_return = false; int maxactive = 0; - long offset = 0; void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf __free(kfree) = NULL; + char *tmp = NULL; + long offset = 0; switch (argv[0][0]) { case 'r': @@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], event++; if (isdigit(argv[0][1])) { + char *buf __free(kfree) = NULL; + if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); return -EINVAL; @@ -905,7 +907,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_err(1, BAD_MAXACT); return -EINVAL; } - memcpy(buf, &argv[0][1], len); + buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { @@ -973,6 +977,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -981,16 +988,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], if (!event) { /* Make a new event name */ + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, @@ -1065,14 +1078,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], static int trace_kprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = TPARG_FL_KERNEL; + trace_probe_log_init("trace_kprobe", argc, argv); - ret = trace_kprobe_create_internal(argc, argv, &ctx); + ret = trace_kprobe_create_internal(argc, argv, ctx); - traceprobe_finish_parse(&ctx); trace_probe_log_clear(); return ret; } @@ -1089,7 +1106,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? 
-EINVAL : ret; } @@ -1567,7 +1584,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -1597,12 +1614,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); @@ -1798,14 +1815,15 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + unsigned int flags = trace_probe_load_flag(&tk->tp); int ret = 0; raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1817,6 +1835,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); struct trace_kprobe *tk; + unsigned int flags; /* * There is a small chance that get_kretprobe(ri) returns NULL when @@ -1829,10 +1848,11 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tk->tp); + if (flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweak kernel, so just return 0 */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ba5858866b2f..c706544be60c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -291,7 +291,6 @@ __init static int init_mmio_trace(void) device_initcall(init_mmio_trace); static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_rw *rw) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_rw(tr, data, rw); + __trace_mmiotrace_rw(tr, rw); } static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_map *map) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); + __trace_mmiotrace_map(tr, map); } int mmio_trace_printk(const char 
*fmt, va_list args) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e732c9e37e14..827104d00bc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -24,6 +24,7 @@ #include <linux/sched/clock.h> #include <uapi/linux/sched/types.h> #include <linux/sched.h> +#include <linux/string.h> #include "trace.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -271,7 +272,7 @@ static inline void tlat_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); if (tlat_var->kthread) hrtimer_cancel(&tlat_var->timer); @@ -295,7 +296,7 @@ static inline void osn_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); memset(osn_var, 0, sizeof(*osn_var)); } @@ -328,7 +329,7 @@ static struct osnoise_data { u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif - bool tainted; /* infor users and developers about a problem */ + bool tainted; /* info users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, @@ -637,8 +638,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); trace_buffer_unlock_commit_nostack(buffer, event); } @@ -737,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) /* * get_int_safe_duration - Get the duration of a window * - * The irq, softirq and thread varaibles need to have its duration without + * The irq, softirq and thread variables need to have its duration without * the interference from higher priority interrupts. Instead of keeping a * variable to discount the interrupt interference from these variables, the * starting time of these variables are pushed forward with the interrupt's @@ -1459,7 +1460,7 @@ static int run_osnoise(void) stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; /* - * Start timestemp + * Start timestamp */ start = time_get(); @@ -1880,7 +1881,7 @@ static int timerlat_main(void *data) tlat->kthread = current; osn_var->pid = current->pid; /* - * Anotate the arrival time. + * Annotate the arrival time. 
+ * Annotate the arrival time.
*/ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); @@ -1977,7 +1978,7 @@ static void stop_per_cpu_kthreads(void) } /* - * start_kthread - Start a workload tread + * start_kthread - Start a workload thread */ static int start_kthread(unsigned int cpu) { @@ -2302,7 +2303,7 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -2320,13 +2321,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + if (count < 1) + return 0; - if (copy_from_user(buf, ubuf, count)) - return -EFAULT; + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2703,7 +2705,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) * Why not using tracing instance per_cpu/ dir? * * Because osnoise/timerlat have a single workload, having - * multiple files like these are wast of memory. + * multiple files like these are waste of memory. */ per_cpu = tracefs_create_dir("per_cpu", top_dir); if (!per_cpu) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b9ab06c99543..cc2d3306bb60 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -420,7 +420,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } mmap_read_unlock(mm); } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); return !trace_seq_has_overflowed(s); } @@ -433,9 +433,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET)); - if (sym_flags & TRACE_ITER_SYM_ADDR) + if (sym_flags & TRACE_ITER(SYM_ADDR)) trace_seq_printf(s, " <" IP_FMT ">", ip); out: @@ -569,7 +569,7 @@ static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { struct trace_array *tr = iter->tr; - unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; + unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE); unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -636,7 +636,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) @@ -647,7 +647,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "[%03d] ", iter->cpu); - if (tr->trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER(IRQ_INFO)) trace_print_lat_fmt(s, entry); trace_print_time(s, iter, iter->ts); @@ -661,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_entry *entry, 
*next_entry; struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE)); u64 next_ts; next_entry = trace_find_next_entry(iter, NULL, &next_ts); @@ -701,6 +701,7 @@ void print_function_args(struct trace_seq *s, unsigned long *args, struct btf *btf; s32 tid, nr = 0; int a, p, x; + u16 encode; trace_seq_printf(s, "("); @@ -744,7 +745,12 @@ void print_function_args(struct trace_seq *s, unsigned long *args, trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_INT: - trace_seq_printf(s, "%ld", arg); + encode = btf_int_encoding(t); + /* Print unsigned ints as hex */ + if (encode & BTF_INT_SIGNED) + trace_seq_printf(s, "%ld", arg); + else + trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); @@ -938,10 +944,15 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c struct list_head *head) { struct ftrace_event_field *field; + struct trace_array *tr = iter->tr; + unsigned long long laddr; + unsigned long addr; int offset; int len; int ret; + int i; void *pos; + char *str; list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); @@ -968,14 +979,35 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c trace_seq_puts(&iter->seq, "<OVERFLOW>"); break; } - pos = (void *)iter->ent + offset; - trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + str = (char *)iter->ent + offset; + /* Check if there's any non printable strings */ + for (i = 0; i < len; i++) { + if (str[i] && !(isascii(str[i]) && isprint(str[i]))) + break; + } + if (i < len) { + for (i = 0; i < len; i++) { + if (isascii(str[i]) && isprint(str[i])) + trace_seq_putc(&iter->seq, str[i]); + else + trace_seq_putc(&iter->seq, '.'); + } + trace_seq_puts(&iter->seq, " ("); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(&iter->seq, ':'); + trace_seq_printf(&iter->seq, "%02x", str[i]); + } + trace_seq_putc(&iter->seq, ')'); + } else { + trace_seq_printf(&iter->seq, "%.*s", len, str); + } break; case FILTER_PTR_STRING: if (!iter->fmt_size) trace_iter_expand_format(iter); - pos = *(void **)pos; - ret = strncpy_from_kernel_nofault(iter->fmt, pos, + addr = trace_adjust_address(tr, *(unsigned long *)pos); + ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr, iter->fmt_size); if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", pos); @@ -984,8 +1016,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c pos, iter->fmt); break; case FILTER_TRACE_FN: - pos = *(void **)pos; - trace_seq_printf(&iter->seq, "%pS", pos); + addr = trace_adjust_address(tr, *(unsigned long *)pos); + trace_seq_printf(&iter->seq, "%pS", (void *)addr); break; case FILTER_CPU: case FILTER_OTHER: @@ -1015,14 +1047,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + addr = *(unsigned int *)pos; + + /* Some fields reference offset from _stext. 
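The print_fields() change above falls back to a 'text plus hex' rendering when a fixed-size string field contains non-printable bytes. A plain user-space rendering of the same check (same isascii()/isprint() test, '.' substitution and colon-separated hex):

#include <ctype.h>
#include <stdio.h>

static void print_str_field(const char *str, int len)
{
	int i;

	/* any non-printable, non-NUL byte triggers the fallback */
	for (i = 0; i < len; i++) {
		unsigned char c = str[i];

		if (c && !(isascii(c) && isprint(c)))
			break;
	}

	if (i == len) {
		printf("%.*s", len, str);
		return;
	}

	for (i = 0; i < len; i++) {
		unsigned char c = str[i];

		putchar(isascii(c) && isprint(c) ? c : '.');
	}

	printf(" (");
	for (i = 0; i < len; i++)
		printf("%s%02x", i ? ":" : "", (unsigned char)str[i]);
	printf(")");
}

int main(void)
{
	print_str_field("abc", 3);	puts("");	/* abc */
	print_str_field("ab\x01", 3);	puts("");	/* ab. (61:62:01) */
	return 0;
}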
*/ + if (!strcmp(field->name, "caller_offs") || + !strcmp(field->name, "parent_offs")) { + unsigned long ip; + + ip = addr + (unsigned long)_stext; + ip = trace_adjust_address(tr, ip); + trace_seq_printf(&iter->seq, "%pS ", (void *)ip); + } + + if (sizeof(long) == 4) { + addr = trace_adjust_address(tr, addr); + trace_seq_printf(&iter->seq, "%pS (%d)", + (void *)addr, (int)addr); + } else { + trace_seq_printf(&iter->seq, "0x%x (%d)", + (unsigned int)addr, (int)addr); + } break; case 8: - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + laddr = *(unsigned long long *)pos; + if (sizeof(long) == 8) { + laddr = trace_adjust_address(tr, (unsigned long)laddr); + trace_seq_printf(&iter->seq, "%pS (%lld)", + (void *)(long)laddr, laddr); + } else { + trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr); + } break; default: trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); @@ -1086,17 +1140,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, - unsigned long *args, int flags) + unsigned long parent_ip, unsigned long *args, + struct trace_array *tr, int flags) { - ip += delta; - parent_ip += delta; + ip = trace_adjust_address(tr, ip); + parent_ip = trace_adjust_address(tr, parent_ip); seq_print_ip_sym(s, ip, flags); if (args) print_function_args(s, args, ip); - if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) { trace_seq_puts(s, " <-"); seq_print_ip_sym(s, parent_ip, flags); } @@ -1119,8 +1173,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, else args = NULL; - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, - args, flags); + print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1387,7 +1440,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "<user stack trace>\n"); - if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) { struct task_struct *task; /* * we do the lookup on the thread group leader, @@ -1437,12 +1490,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d", field->seqnum, field->duration, field->outer_duration, - (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec, field->count); + &field->timestamp, + field->count); if (field->nmi_count) { /* @@ -1706,7 +1759,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - ip = field->ip + iter->tr->text_delta; + ip = trace_adjust_address(iter->tr, field->ip); seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); @@ -1792,7 +1845,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); + print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h 
b/kernel/trace/trace_output.h index 2e305364f2a9..99b676733d46 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,17 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags | TRACE_ITER(SYM_OFFSET)); +} +static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET)); +} + extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2eeecb6c95ee..2f571083ce9e 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,8 +13,8 @@ #include <linux/bpf.h> #include <linux/fs.h> -#include "trace_btf.h" +#include "trace_btf.h" #include "trace_probe.h" #undef C @@ -154,22 +154,30 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; trace_probe_log.index = 0; + return subsystem; } void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +186,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; @@ -205,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type) p = command; for (i = 0; i < trace_probe_log.argc; i++) { len = strlen(trace_probe_log.argv[i]); - strcpy(p, trace_probe_log.argv[i]); + memcpy(p, trace_probe_log.argv[i], len); p[len] = ' '; p += len + 1; } @@ -238,7 +248,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } -/* @buf must has MAX_EVENT_NAME_LEN size */ +/** + * traceprobe_parse_event_name() - Parse a string into group and event names + * @pevent: A pointer to the string to be parsed. + * @pgroup: A pointer to the group name. + * @buf: A buffer to store the parsed group name. + * @offset: The offset of the string in the original user command, for logging. + * + * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]` + * (either GROUP or EVENT or both must be specified). + * Since the parsed group name is stored in @buf, the caller must ensure @buf + * is at least MAX_EVENT_NAME_LEN bytes. + * + * Return: 0 on success, or -EINVAL on failure. + * + * If success, *@pevent is updated to point to the event name part of the + * original string, or NULL if there is no event name. + * Also, *@pgroup is updated to point to the parsed group which is stored + * in @buf, or NULL if there is no group name. 
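A sketch of a caller honouring the contract just documented, modelled on the trace_kprobe_create_internal() hunk earlier in this diff (which now allocates gbuf on the heap); parse_spec() and its defaults are illustrative, while the helper and MAX_EVENT_NAME_LEN come from trace_probe.h:

static int parse_spec(const char *spec)
{
	const char *event = spec;
	const char *group = "kprobes";		/* illustrative default group */
	char *gbuf __free(kfree) = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
	int ret;

	if (!gbuf)
		return -ENOMEM;

	/* last argument: offset of @spec in the user command, only for logging */
	ret = traceprobe_parse_event_name(&event, &group, gbuf, 0);
	if (ret)
		return ret;

	/* @event now points into @spec (or is NULL); when a GROUP prefix was
	 * given, @group points into gbuf, so gbuf must outlive its users. */
	return 0;
}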
+ */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset) { @@ -489,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx) } } -/* Return 1 if the field separater is arrow operator ('->') */ +/* Return 1 if the field separator is arrow operator ('->') */ static int split_next_field(char *varname, char **next_field, struct traceprobe_parse_context *ctx) { @@ -648,7 +676,7 @@ static int parse_btf_arg(char *varname, ret = query_btf_context(ctx); if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); - return PTR_ERR(params); + return -ENOENT; } } params = ctx->params; @@ -770,6 +798,36 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset) +{ + code[0].op = FETCH_OP_ARG; + code[0].param = argnum; + code[1].op = FETCH_OP_ST_EDATA; + code[1].offset = offset; +} + +static int get_entry_arg_max_offset(struct probe_entry_arg *earg) +{ + int i, max_offset = 0; + + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) { + if (earg->code[i].op == FETCH_OP_ST_EDATA) + if (earg->code[i].offset > max_offset) + max_offset = earg->code[i].offset; + } + return max_offset; +} + /* * Add the entry code to store the 'argnum'th parameter and return the offset * in the entry data buffer where the data will be stored. @@ -777,8 +835,7 @@ static int check_prepare_btf_string_fetch(char *typename, static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; - bool match = false; - int i, offset; + int i, offset, last_offset = 0; if (!earg) { earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); @@ -795,78 +852,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; tp->entry_arg = earg; + store_entry_arg_at(earg->code, argnum, 0); + return 0; } /* - * The entry code array is repeating the pair of - * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] - * and the rest of entries are filled with [FETCH_OP_END]. + * NOTE: if anyone change the following rule, please rewrite this. + * The entry code array is filled with the pair of * - * To reduce the redundant function parameter fetching, we scan the entry - * code array to find the FETCH_OP_ARG which already fetches the 'argnum' - * parameter. If it doesn't match, update 'offset' to find the last - * offset. - * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we - * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and - * return data offset so that caller can find the data offset in the entry - * data buffer. + * [FETCH_OP_ARG(argnum)] + * [FETCH_OP_ST_EDATA(offset of entry data buffer)] + * + * and the rest of entries are filled with [FETCH_OP_END]. + * The offset should be incremented, thus the last pair should + * have the largest offset. 
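A worked example of the pair layout described above, assuming one probe stores function argument 2 and then argument 0; the enum values and struct are stand-ins for the FETCH_OP_* codes and struct fetch_insn:

#include <stdio.h>

enum { OP_END, OP_ARG, OP_ST_EDATA };	/* stand-ins for FETCH_OP_* */
struct insn { int op, param, offset; };	/* stand-in for struct fetch_insn */

int main(void)
{
	/* pairs are appended in call order and offsets only grow */
	struct insn code[] = {
		{ OP_ARG,      2, 0 },
		{ OP_ST_EDATA, 0, 0 },
		{ OP_ARG,      0, 0 },
		{ OP_ST_EDATA, 0, (int)sizeof(long) },
		{ OP_END,      0, 0 },
	};
	int i, max = 0;

	/* same scan as get_entry_arg_max_offset(): last/largest ST_EDATA wins */
	for (i = 0; code[i].op != OP_END; i++)
		if (code[i].op == OP_ST_EDATA && code[i].offset > max)
			max = code[i].offset;

	/* entry data size = largest offset + one long word: 16 on LP64 */
	printf("%zu\n", max + sizeof(long));
	return 0;
}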
*/ - offset = 0; - for (i = 0; i < earg->size - 1; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - earg->code[i].op = FETCH_OP_ARG; - earg->code[i].param = argnum; - earg->code[i + 1].op = FETCH_OP_ST_EDATA; - earg->code[i + 1].offset = offset; - return offset; - case FETCH_OP_ARG: - match = (earg->code[i].param == argnum); - break; - case FETCH_OP_ST_EDATA: - offset = earg->code[i].offset; - if (match) - return offset; - offset += sizeof(unsigned long); - break; - default: - break; - } + + /* Search the offset for the sprcified argnum. */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) { + if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG)) + return -EINVAL; + + if (earg->code[i].param != argnum) + continue; + + if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + + return earg->code[i + 1].offset; } - return -ENOSPC; + /* Not found, append new entry if possible. */ + if (i >= earg->size - 1) + return -ENOSPC; + + /* The last entry must have the largest offset. */ + if (i != 0) { + if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + last_offset = earg->code[i - 1].offset; + } + + offset = last_offset + sizeof(unsigned long); + store_entry_arg_at(&earg->code[i], argnum, offset); + return offset; } int traceprobe_get_entry_data_size(struct trace_probe *tp) { struct probe_entry_arg *earg = tp->entry_arg; - int i, size = 0; if (!earg) return 0; - /* - * earg->code[] array has an operation sequence which is run in - * the entry handler. - * The sequence stopped by FETCH_OP_END and each data stored in - * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA - * stores the data at the data buffer + its offset, and all data are - * "unsigned long" size. The offset must be increased when a data is - * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the - * code array. 
- */ - for (i = 0; i < earg->size; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - goto out; - case FETCH_OP_ST_EDATA: - size = earg->code[i].offset + sizeof(unsigned long); - break; - default: - break; - } - } -out: - return size; + return get_entry_arg_max_offset(earg) + sizeof(unsigned long); } void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 854e5668f5ee..9fc56c937130 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -10,20 +10,22 @@ * Author: Srikar Dronamraju */ +#include <linux/bitops.h> +#include <linux/btf.h> +#include <linux/cleanup.h> +#include <linux/kprobes.h> +#include <linux/limits.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/tracefs.h> -#include <linux/types.h> #include <linux/string.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/kprobes.h> #include <linux/stringify.h> -#include <linux/limits.h> +#include <linux/tracefs.h> +#include <linux/types.h> #include <linux/uaccess.h> -#include <linux/bitops.h> -#include <linux/btf.h> + #include <asm/bitsperlong.h> #include "trace.h" @@ -269,16 +271,21 @@ struct event_file_link { struct list_head list; }; +static inline unsigned int trace_probe_load_flag(struct trace_probe *tp) +{ + return smp_load_acquire(&tp->event->flags); +} + static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->event->flags & flag); + return !!(trace_probe_load_flag(tp) & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, unsigned int flag) { - tp->event->flags |= flag; + smp_store_release(&tp->event->flags, tp->event->flags | flag); } static inline void trace_probe_clear_flag(struct trace_probe *tp, @@ -438,6 +445,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); * this MUST be called for clean up the context and return a resource. 
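The new trace_probe_load_flag()/trace_probe_set_flag() pair above uses an acquire load against a release store so that a probe handler can sample the flag word once and test both TP_FLAG_TRACE and TP_FLAG_PROFILE on one consistent snapshot, which is how the kprobe dispatchers earlier in this diff now use it. A stripped-down sketch; the structure and flag values here are stand-ins:

#include <asm/barrier.h>

#define FLAG_TRACE	0x1	/* stand-ins for TP_FLAG_TRACE / TP_FLAG_PROFILE */
#define FLAG_PROFILE	0x2

struct probe_ev { unsigned int flags; };	/* stand-in for the event struct */

static void ev_set_flag(struct probe_ev *ev, unsigned int flag)
{
	/* release store: pairs with the acquire load in the handler */
	smp_store_release(&ev->flags, ev->flags | flag);
}

static void ev_dispatch(struct probe_ev *ev)
{
	/* sample once, then test the snapshot for both consumers */
	unsigned int flags = smp_load_acquire(&ev->flags);

	if (flags & FLAG_TRACE) {
		/* tracing path */
	}
	if (flags & FLAG_PROFILE) {
		/* perf path */
	}
}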
*/ void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); +static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx) +{ + traceprobe_finish_parse(ctx); + kfree(ctx); +} + +DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *, + if (_T) traceprobe_free_parse_ctx(_T)) extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, @@ -563,11 +578,13 @@ struct trace_probe_log { int index; }; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv); void trace_probe_log_set_index(int index); void trace_probe_log_clear(void); void __trace_probe_log_err(int offset, int err); +DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear()) + #define trace_probe_log_err(offs, err) \ __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb49f7279dc8..c46d584ded3b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) /* Place map_cmdline_to_pid array right after saved_cmdlines */ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; - s->cmdline_idx = 0; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, @@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid) return 1; + BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT)); + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a0db3404f7f7..8faa73d3bba1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else # define is_graph(tr) false #endif @@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr, goto out_enable; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; return 1; out: - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); out_enable: preempt_enable_notrace(); @@ -138,13 +138,11 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, return 0; calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); - if (!calltime) - return 0; - - *calltime = trace_clock_local(); - - ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + if (calltime) { + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } + local_dec(&data->disabled); preempt_enable_notrace(); return ret; @@ -169,12 +167,10 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); - if (!calltime) - return; - - __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + if (calltime) + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); 
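The irqsoff and wakeup hunks around here converge on one guard: the per-CPU data->disabled counter becomes a local_t, since only its own CPU ever touches it, and the event is recorded only on the 0 to 1 transition while a nested hit just increments and backs out. A minimal sketch of the pattern; the structure and function names are stand-ins for trace_array_cpu and its callers:

#include <asm/local.h>

struct pcpu_data { local_t disabled; };		/* stand-in for trace_array_cpu */

static void maybe_record(struct pcpu_data *data)
{
	long disabled = local_inc_return(&data->disabled);

	if (disabled == 1) {
		/* this CPU was not already inside the tracer: record the event */
	}

	local_dec(&data->disabled);
}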
+ local_dec(&data->disabled); preempt_enable_notrace(); return; } @@ -243,7 +239,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); } @@ -251,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -281,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -328,7 +324,7 @@ __trace_function(struct trace_array *tr, trace_function(tr, ip, parent_ip, trace_ctx, NULL); } -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -336,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return wakeup_display_graph(tr, set); #endif @@ -471,7 +467,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -508,7 +504,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -563,7 +559,7 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -610,7 +606,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) @@ -685,8 +681,8 @@ static int __wakeup_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); tr->max_latency = 0; wakeup_trace = tr; @@ -729,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = 
save_flags & TRACE_ITER_OVERWRITE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); ftrace_reset_array_ops(tr); wakeup_busy = false; } diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index c158d65a8a88..32684ef4fb9d 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -15,7 +15,7 @@ * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into - * the buffer but it wont update the pointers). This allows users to + * the buffer but it won't update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails * they can flush it and try again. * diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..0aa2514a6593 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -542,7 +542,7 @@ static __init int enable_stacktrace(char *str) int len; if ((len = str_has_prefix(str, "_filter="))) - strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + strscpy(stack_trace_filter_buf, str + len); stack_tracer_enabled = 1; return 1; @@ -578,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46aab0ab9350..e96d0063cbcf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/kernel_stat.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -123,6 +124,119 @@ const char *get_syscall_name(int syscall) return entry->name; } +/* Added to user strings or arrays when max limit is reached */ +#define EXTRA "..." 
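[Editor's note] The helper that follows (get_dynamic_len_ptr()) decodes the per-argument meta word these syscall events use to locate copied-in user data, mirroring ftrace's __data_loc convention: the low 16 bits hold the offset of the data from the start of the event record and the high 16 bits hold its length (syscall_put_data() further down writes it as "val | (user_sizes[i] << 16)"). Below is a minimal stand-alone sketch of just that packing; the pack/unpack helper names and the sample values are hypothetical, only the bit layout is taken from the patch.

#include <stdint.h>
#include <stdio.h>

/*
 * Pack the offset (from the start of the event) and the length of the
 * copied user data into one 32-bit meta word, the same layout that
 * syscall_put_data() stores with "val | (user_sizes[i] << 16)".
 */
static uint32_t pack_user_meta(uint16_t offset, uint16_t len)
{
	return (uint32_t)offset | ((uint32_t)len << 16);
}

/*
 * Undo the packing the way get_dynamic_len_ptr() does:
 * offset = val & 0xffff, len = val >> 16.
 */
static void unpack_user_meta(uint32_t val, uint16_t *offset, uint16_t *len)
{
	*offset = val & 0xffff;
	*len = val >> 16;
}

int main(void)
{
	/* Hypothetical values: 7 bytes of filename stored 44 bytes into the event. */
	uint32_t meta = pack_user_meta(44, 7);
	uint16_t off, len;

	unpack_user_meta(meta, &off, &len);
	printf("offset=%u len=%u\n", (unsigned)off, (unsigned)len); /* offset=44 len=7 */
	return 0;
}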
+ +static void get_dynamic_len_ptr(struct syscall_trace_enter *trace, + struct syscall_metadata *entry, + int *offset_p, int *len_p, unsigned char **ptr_p) +{ + unsigned char *ptr; + int offset = *offset_p; + int val; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)trace + (val & 0xffff); + *len_p = val >> 16; + offset += 4; + + *ptr_p = ptr; + *offset_p = offset; +} + +static enum print_line_t +sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry, + struct trace_seq *s, struct trace_event *event) +{ + unsigned char *ptr; + int offset = 0; + int bits, len; + bool done = false; + static const struct trace_print_flags __flags[] = + { + { O_TMPFILE, "O_TMPFILE" }, + { O_WRONLY, "O_WRONLY" }, + { O_RDWR, "O_RDWR" }, + { O_CREAT, "O_CREAT" }, + { O_EXCL, "O_EXCL" }, + { O_NOCTTY, "O_NOCTTY" }, + { O_TRUNC, "O_TRUNC" }, + { O_APPEND, "O_APPEND" }, + { O_NONBLOCK, "O_NONBLOCK" }, + { O_DSYNC, "O_DSYNC" }, + { O_DIRECT, "O_DIRECT" }, + { O_LARGEFILE, "O_LARGEFILE" }, + { O_DIRECTORY, "O_DIRECTORY" }, + { O_NOFOLLOW, "O_NOFOLLOW" }, + { O_NOATIME, "O_NOATIME" }, + { O_CLOEXEC, "O_CLOEXEC" }, + { -1, NULL } + }; + + trace_seq_printf(s, "%s(", entry->name); + + for (int i = 0; !done && i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + if (i) + trace_seq_puts(s, ", "); + + switch (i) { + case 2: + bits = trace->args[2]; + + trace_seq_puts(s, "flags: "); + + /* No need to show mode when not creating the file */ + if (!(bits & (O_CREAT|O_TMPFILE))) + done = true; + + if (!(bits & O_ACCMODE)) { + if (!bits) { + trace_seq_puts(s, "O_RDONLY"); + continue; + } + trace_seq_puts(s, "O_RDONLY|"); + } + + trace_print_flags_seq(s, "|", bits, __flags); + /* + * trace_print_flags_seq() adds a '\0' to the + * buffer, but this needs to append more to the seq. 
+ */ + if (!trace_seq_has_overflowed(s)) + trace_seq_pop(s); + + continue; + case 3: + trace_seq_printf(s, "%s: 0%03o", entry->args[i], + (unsigned int)trace->args[i]); + continue; + } + + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + trace_seq_printf(s, " \"%.*s\"", len, ptr); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -132,7 +246,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall; + int i, syscall, val, len; + unsigned char *ptr; + int offset = 0; trace = (typeof(trace))ent; syscall = trace->nr; @@ -146,21 +262,80 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } + switch (entry->syscall_nr) { + case __NR_openat: + if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE))) + return sys_enter_openat_print(trace, entry, s, event); + break; + default: + break; + } + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + bool printable = false; + char *str; if (trace_seq_has_overflowed(s)) goto end; + if (i) + trace_seq_puts(s, ", "); + /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ - trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); + if (trace->args[i] < 10) + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + else + trace_seq_printf(s, "%s: 0x%lx", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + + if (entry->user_arg_size < 0 || entry->user_arg_is_str) { + trace_seq_printf(s, " \"%.*s\"", len, ptr); + continue; + } + + val = trace->args[entry->user_arg_size]; + + str = ptr; + trace_seq_puts(s, " ("); + for (int x = 0; x < len; x++, ptr++) { + if (isascii(*ptr) && isprint(*ptr)) + printable = true; + if (x) + trace_seq_putc(s, ':'); + trace_seq_printf(s, "%02x", *ptr); + } + if (len < val) + trace_seq_printf(s, ", %s", EXTRA); + + trace_seq_putc(s, ')'); + + /* If nothing is printable, don't bother printing anything */ + if (!printable) + continue; + + trace_seq_puts(s, " \""); + for (int x = 0; x < len; x++) { + if (isascii(str[x]) && isprint(str[x])) + trace_seq_putc(s, str[x]); + else + trace_seq_putc(s, '.'); + } + if (len < val) + trace_seq_printf(s, "\"%s", EXTRA); + else + trace_seq_putc(s, '"'); } trace_seq_putc(s, ')'); @@ -206,26 +381,107 @@ print_syscall_exit(struct trace_iterator *iter, int flags, .size = sizeof(_type), .align = __alignof__(_type), \ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? 
len - pos : 0) + +static int __init +sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, + "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\","); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->dfd)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->filename)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __get_str(__filename_val),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " REC->flags ? __print_flags(REC->flags, \"|\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_RDWR\" }, ", O_RDWR); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CREAT\" }, ", O_CREAT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_EXCL\" }, ", O_EXCL); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_APPEND\" }, ", O_APPEND); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC); + + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->mode))"); + return pos; +} + static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { + bool is_string = entry->user_arg_is_str; int i; int pos = 0; - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) + switch (entry->syscall_nr) { + case __NR_openat: + return sys_enter_openat_print_fmt(entry, buf, len); + default: + break; + } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? 
"" : ", "); + if (i) + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", + entry->args[i], sizeof(unsigned long)); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* Add the format for the user space string or array */ + if (entry->user_arg_size < 0 || is_string) + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)"); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { pos += snprintf(buf + pos, LEN_OR_ZERO, ", ((unsigned long)(REC->%s))", entry->args[i]); + if (!(BIT(i) & entry->user_mask)) + continue; + /* The user space data for arg has name __<arg>_val */ + if (entry->user_arg_size < 0 || is_string) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); + } else { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)", + entry->args[i]); + } } #undef LEN_OR_ZERO @@ -271,8 +527,11 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; + unsigned long mask; + char *arg; int offset = offsetof(typeof(trace), args); int ret = 0; + int len; int i; for (i = 0; i < meta->nb_args; i++) { @@ -285,9 +544,320 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) offset += sizeof(unsigned long); } + if (ret || !meta->user_mask) + return ret; + + mask = meta->user_mask; + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + /* + * User space data is faulted into a temporary buffer and then + * added as a dynamic string or array to the end of the event. + * The user space data name for the arg pointer is + * "__<arg>_val". + */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) { + kfree(arg); + break; + } + offset += 4; + } return ret; } +/* + * Create a per CPU temporary buffer to copy user space pointers into. + * + * SYSCALL_FAULT_USER_MAX is the amount to copy from user space. + * (defined in kernel/trace/trace.h) + + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the + * nul terminating byte and possibly appended EXTRA (4 bytes). + * + * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use + * to copy memory from user space addresses into that will hold + * 3 args as only 3 args are allowed to be copied from system calls. 
+ */ +#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4) +#define SYSCALL_FAULT_MAX_CNT 3 +#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT) + +/* Use the tracing per CPU buffer infrastructure to copy from user space */ +struct syscall_user_buffer { + struct trace_user_buf_info buf; + struct rcu_head rcu; +}; + +static struct syscall_user_buffer *syscall_buffer; + +static int syscall_fault_buffer_enable(void) +{ + struct syscall_user_buffer *sbuf; + int ret; + + lockdep_assert_held(&syscall_trace_lock); + + if (syscall_buffer) { + trace_user_fault_get(&syscall_buffer->buf); + return 0; + } + + sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); + if (ret < 0) { + kfree(sbuf); + return ret; + } + + WRITE_ONCE(syscall_buffer, sbuf); + + return 0; +} + +static void rcu_free_syscall_buffer(struct rcu_head *rcu) +{ + struct syscall_user_buffer *sbuf = + container_of(rcu, struct syscall_user_buffer, rcu); + + trace_user_fault_destroy(&sbuf->buf); + kfree(sbuf); +} + + +static void syscall_fault_buffer_disable(void) +{ + struct syscall_user_buffer *sbuf = syscall_buffer; + + lockdep_assert_held(&syscall_trace_lock); + + if (trace_user_fault_put(&sbuf->buf)) + return; + + WRITE_ONCE(syscall_buffer, NULL); + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); +} + +struct syscall_args { + char *ptr_array[SYSCALL_FAULT_MAX_CNT]; + int read[SYSCALL_FAULT_MAX_CNT]; + int uargs; +}; + +static int syscall_copy_user(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = strncpy_from_user(buf, ptr, size); + args->read[i] = ret; + } + return 0; +} + +static int syscall_copy_user_array(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = __copy_from_user(buf, ptr, size); + args->read[i] = ret ? -1 : size; + } + return 0; +} + +static char *sys_fault_user(unsigned int buf_size, + struct syscall_metadata *sys_data, + struct syscall_user_buffer *sbuf, + unsigned long *args, + unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) +{ + trace_user_buf_copy syscall_copy = syscall_copy_user; + unsigned long mask = sys_data->user_mask; + unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; + struct syscall_args sargs; + bool array = false; + char *buffer; + char *buf; + int ret; + int i = 0; + + /* The extra is appended to the user data in the buffer */ + BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= + SYSCALL_FAULT_ARG_SZ); + + /* + * If this system call event has a size argument, use + * it to define how much of user space memory to read, + * and read it as an array and not a string. 
+ */ + if (sys_data->user_arg_size >= 0) { + array = true; + size = args[sys_data->user_arg_size]; + if (size > SYSCALL_FAULT_ARG_SZ - 1) + size = SYSCALL_FAULT_ARG_SZ - 1; + syscall_copy = syscall_copy_user_array; + } + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) + break; + + /* Get the pointer to user space memory to read */ + sargs.ptr_array[i++] = (char *)args[idx]; + } + + sargs.uargs = i; + + /* Clear the values that are not used */ + for (; i < SYSCALL_FAULT_MAX_CNT; i++) { + data_size[i] = -1; /* Denotes no pointer */ + } + + /* A zero size means do not even try */ + if (!buf_size) + return NULL; + + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, + syscall_copy, &sargs); + if (!buffer) + return NULL; + + buf = buffer; + for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + + ret = sargs.read[i]; + if (ret < 0) + continue; + buf[ret] = '\0'; + + /* For strings, replace any non-printable characters with '.' */ + if (!array) { + for (int x = 0; x < ret; x++) { + if (!isprint(buf[x])) + buf[x] = '.'; + } + + size = min(buf_size, SYSCALL_FAULT_USER_MAX); + + /* + * If the text was truncated due to our max limit, + * add "..." to the string. + */ + if (ret > size) { + strscpy(buf + size, EXTRA, sizeof(EXTRA)); + ret = size + sizeof(EXTRA); + } else { + buf[ret++] = '\0'; + } + } else { + ret = min((unsigned int)ret, buf_size); + } + data_size[i] = ret; + } + + return buffer; +} + +static int +syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, + char **buffer, int *size, int *user_sizes, int *uargs, + int buf_size) +{ + struct syscall_user_buffer *sbuf; + int i; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return -1; + + *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) { + if (user_sizes[i] < 0) + break; + *size += user_sizes[i] + 4; + } + /* Save the number of user read arguments of this syscall */ + *uargs = i; + return 0; +} + +static void syscall_put_data(struct syscall_metadata *sys_data, + struct syscall_trace_enter *entry, + char *buffer, int size, int *user_sizes, int uargs) +{ + char *buf = buffer; + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. + */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. That is after the static arguments + * and the meta data fields. 
+ */ + val = (ptr - (void *)entry) + 4 * uargs; + + for (int i = 0; i < uargs; i++) { + + if (i) + val += user_sizes[i - 1]; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_sizes[i] << 16); + + /* Skip the meta data */ + ptr += 4; + } + + for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + /* Nothing to do if the user space was empty or faulted */ + if (!user_sizes[i]) + continue; + + memcpy(ptr, buf, user_sizes[i]); + ptr += user_sizes[i]; + } +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -296,22 +866,24 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct trace_event_buffer fbuffer; unsigned long args[6]; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; int syscall_nr; - int size; + int size = 0; + int uargs = 0; + bool mayfault; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]); if (!trace_file) return; @@ -322,7 +894,20 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!sys_data) return; - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + guard(preempt_notrace)(); + + syscall_get_arguments(current, regs, args); + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0) + return; + } + + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) @@ -330,9 +915,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs); + trace_event_buffer_commit(&fbuffer); } @@ -356,8 +944,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]); if (!trace_file) return; @@ -382,39 +969,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) static int reg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int ret = 0; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!tr->sys_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + 
if (ret < 0) + return ret; + } + if (!tr->sys_refcount_enter) { ret = register_trace_sys_enter(ftrace_syscall_enter, tr); - if (!ret) { - rcu_assign_pointer(tr->enter_syscall_files[num], file); - tr->sys_refcount_enter++; + if (ret < 0) { + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + WRITE_ONCE(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + return 0; } static void unreg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); tr->sys_refcount_enter--; - RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); + WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int reg_event_syscall_exit(struct trace_event_file *file, @@ -431,7 +1029,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - rcu_assign_pointer(tr->exit_syscall_files[num], file); + WRITE_ONCE(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -449,12 +1047,221 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); + WRITE_ONCE(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } +/* + * For system calls that reference user space memory that can + * be recorded into the event, set the system call meta data's user_mask + * to the "args" index that points to the user space memory to retrieve. 
+ */ +static void check_faultable_syscall(struct trace_event_call *call, int nr) +{ + struct syscall_metadata *sys_data = call->data; + unsigned long mask; + + /* Only work on entry */ + if (sys_data->enter_event != call) + return; + + sys_data->user_arg_size = -1; + + switch (nr) { + /* user arg 1 with size arg at 2 */ + case __NR_write: +#ifdef __NR_mq_timedsend + case __NR_mq_timedsend: +#endif + case __NR_pwrite64: + sys_data->user_mask = BIT(1); + sys_data->user_arg_size = 2; + break; + /* user arg 0 with size arg at 1 as string */ + case __NR_setdomainname: + case __NR_sethostname: + sys_data->user_mask = BIT(0); + sys_data->user_arg_size = 1; + sys_data->user_arg_is_str = 1; + break; +#ifdef __NR_kexec_file_load + /* user arg 4 with size arg at 3 as string */ + case __NR_kexec_file_load: + sys_data->user_mask = BIT(4); + sys_data->user_arg_size = 3; + sys_data->user_arg_is_str = 1; + break; +#endif + /* user arg at position 0 */ +#ifdef __NR_access + case __NR_access: +#endif + case __NR_acct: + case __NR_chdir: +#ifdef __NR_chown + case __NR_chown: +#endif +#ifdef __NR_chmod + case __NR_chmod: +#endif + case __NR_chroot: +#ifdef __NR_creat + case __NR_creat: +#endif + case __NR_delete_module: + case __NR_execve: + case __NR_fsopen: +#ifdef __NR_lchown + case __NR_lchown: +#endif +#ifdef __NR_open + case __NR_open: +#endif + case __NR_memfd_create: +#ifdef __NR_mkdir + case __NR_mkdir: +#endif +#ifdef __NR_mknod + case __NR_mknod: +#endif + case __NR_mq_open: + case __NR_mq_unlink: +#ifdef __NR_readlink + case __NR_readlink: +#endif +#ifdef __NR_rmdir + case __NR_rmdir: +#endif + case __NR_shmdt: +#ifdef __NR_statfs + case __NR_statfs: +#endif + case __NR_swapon: + case __NR_swapoff: +#ifdef __NR_truncate + case __NR_truncate: +#endif +#ifdef __NR_unlink + case __NR_unlink: +#endif + case __NR_umount2: +#ifdef __NR_utime + case __NR_utime: +#endif +#ifdef __NR_utimes + case __NR_utimes: +#endif + sys_data->user_mask = BIT(0); + break; + /* user arg at position 1 */ + case __NR_execveat: + case __NR_faccessat: + case __NR_faccessat2: + case __NR_finit_module: + case __NR_fchmodat: + case __NR_fchmodat2: + case __NR_fchownat: + case __NR_fgetxattr: + case __NR_flistxattr: + case __NR_fsetxattr: + case __NR_fspick: + case __NR_fremovexattr: +#ifdef __NR_futimesat + case __NR_futimesat: +#endif + case __NR_inotify_add_watch: + case __NR_mkdirat: + case __NR_mknodat: + case __NR_mount_setattr: + case __NR_name_to_handle_at: +#ifdef __NR_newfstatat + case __NR_newfstatat: +#endif + case __NR_openat: + case __NR_openat2: + case __NR_open_tree: + case __NR_open_tree_attr: + case __NR_readlinkat: + case __NR_quotactl: + case __NR_syslog: + case __NR_statx: + case __NR_unlinkat: +#ifdef __NR_utimensat + case __NR_utimensat: +#endif + sys_data->user_mask = BIT(1); + break; + /* user arg at position 2 */ + case __NR_init_module: + case __NR_fsconfig: + sys_data->user_mask = BIT(2); + break; + /* user arg at position 4 */ + case __NR_fanotify_mark: + sys_data->user_mask = BIT(4); + break; + /* 2 user args, 0 and 1 */ + case __NR_add_key: + case __NR_getxattr: + case __NR_lgetxattr: + case __NR_lremovexattr: +#ifdef __NR_link + case __NR_link: +#endif + case __NR_listxattr: + case __NR_llistxattr: + case __NR_lsetxattr: + case __NR_pivot_root: + case __NR_removexattr: +#ifdef __NR_rename + case __NR_rename: +#endif + case __NR_request_key: + case __NR_setxattr: +#ifdef __NR_symlink + case __NR_symlink: +#endif + sys_data->user_mask = BIT(0) | BIT(1); + break; + /* 2 user args, 0 and 2 */ + case 
__NR_symlinkat: + sys_data->user_mask = BIT(0) | BIT(2); + break; + /* 2 user args, 1 and 3 */ + case __NR_getxattrat: + case __NR_linkat: + case __NR_listxattrat: + case __NR_move_mount: +#ifdef __NR_renameat + case __NR_renameat: +#endif + case __NR_renameat2: + case __NR_removexattrat: + case __NR_setxattrat: + sys_data->user_mask = BIT(1) | BIT(3); + break; + case __NR_mount: /* Just dev_name and dir_name, TODO add type */ + sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); + break; + default: + sys_data->user_mask = 0; + return; + } + + if (sys_data->user_arg_size < 0) + return; + + /* + * The user_arg_size can only be used when the system call + * is reading only a single address from user space. + */ + mask = sys_data->user_mask; + if (WARN_ON(mask & (mask - 1))) + sys_data->user_arg_size = -1; +} + static int __init init_syscall_trace(struct trace_event_call *call) { int id; @@ -467,6 +1274,8 @@ static int __init init_syscall_trace(struct trace_event_call *call) return -ENOSYS; } + check_faultable_syscall(call, num); + if (set_syscall_print_fmt(call) < 0) return -ENOMEM; @@ -594,9 +1403,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; + bool mayfault; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; + int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; int syscall_nr; int rctx; - int size; + int size = 0; + int uargs = 0; /* * Syscall probe called with preemption enabled, but the ring @@ -615,13 +1429,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + syscall_get_arguments(current, regs, args); + + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, buf_size) < 0) + return; + } + head = this_cpu_ptr(sys_data->enter_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -630,9 +1455,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, args); memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); + if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { @@ -647,36 +1474,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static int perf_sysenter_enable(struct trace_event_call *call) { - int ret = 0; + struct syscall_metadata *sys_data = call->data; int num; + int ret; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!sys_perf_refcount_enter) { ret = register_trace_sys_enter(perf_syscall_enter, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall entry trace point"); - 
} else { - set_bit(num, enabled_perf_enter_syscalls); - sys_perf_refcount_enter++; + if (ret) { + pr_info("event trace: Could not activate syscall entry trace point"); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + return 0; } static void perf_sysenter_disable(struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, @@ -753,22 +1590,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) static int perf_sysexit_enable(struct trace_event_call *call) { - int ret = 0; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall exit trace point"); - } else { - set_bit(num, enabled_perf_exit_syscalls); - sys_perf_refcount_exit++; + guard(mutex)(&syscall_trace_lock); + if (!sys_perf_refcount_exit) { + int ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate syscall exit trace point"); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; + return 0; } static void perf_sysexit_disable(struct trace_event_call *call) @@ -777,12 +1613,11 @@ static void perf_sysexit_disable(struct trace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) unregister_trace_sys_exit(perf_syscall_exit, NULL); - mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3386439ec9f6..1b4f32e2b9bd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,17 +8,19 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/bpf-cgroup.h> -#include <linux/security.h> +#include <linux/cleanup.h> #include <linux/ctype.h> +#include <linux/filter.h> #include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/uprobes.h> #include <linux/namei.h> -#include <linux/string.h> -#include <linux/rculist.h> -#include <linux/filter.h> #include <linux/percpu.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -531,20 +533,25 @@ static int register_trace_uprobe(struct trace_uprobe *tu) return ret; } +DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T)) + /* * Argument syntax: * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static 
int __trace_uprobe_create(int argc, const char **argv) { - struct trace_uprobe *tu; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_uprobe *tu __free(free_trace_uprobe) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; - char *arg, *filename, *rctr, *rctr_end, *tmp; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - enum probe_print_type ptype; - struct path path; + struct path path __free(path_put) = {}; unsigned long offset, ref_ctr_offset; + char *filename __free(kfree) = NULL; + char *arg, *rctr, *rctr_end, *tmp; + char *gbuf __free(kfree) = NULL; + char *buf __free(kfree) = NULL; + enum probe_print_type ptype; bool is_return = false; int i, ret; @@ -563,7 +570,7 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_uprobe", argc, argv); + trlog = trace_probe_log_init("trace_uprobe", argc, argv); if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); @@ -583,10 +590,8 @@ static int __trace_uprobe_create(int argc, const char **argv) /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(filename, ':'); - if (!arg || !isdigit(arg[1])) { - kfree(filename); + if (!arg || !isdigit(arg[1])) return -ECANCELED; - } trace_probe_log_set_index(1); /* filename is the 2nd argument */ @@ -594,14 +599,11 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); - kfree(filename); - trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { trace_probe_log_err(0, NO_REGULAR_FILE); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } /* Parse reference counter offset if specified. 
*/ @@ -609,16 +611,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (rctr) { rctr_end = strchr(rctr, ')'); if (!rctr_end) { - ret = -EINVAL; rctr_end = rctr + strlen(rctr); trace_probe_log_err(rctr_end - filename, REFCNT_OPEN_BRACE); - goto fail_address_parse; + return -EINVAL; } else if (rctr_end[1] != '\0') { - ret = -EINVAL; trace_probe_log_err(rctr_end + 1 - filename, BAD_REFCNT_SUFFIX); - goto fail_address_parse; + return -EINVAL; } *rctr++ = '\0'; @@ -626,7 +626,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { trace_probe_log_err(rctr - filename, BAD_REFCNT); - goto fail_address_parse; + return ret; } } @@ -638,8 +638,7 @@ static int __trace_uprobe_create(int argc, const char **argv) is_return = true; } else { trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } } @@ -647,16 +646,20 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(arg, 0, &offset); if (ret) { trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); - goto fail_address_parse; + return ret; } /* setup a probe */ trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto fail_address_parse; + return ret; } if (!event) { @@ -664,15 +667,16 @@ static int __trace_uprobe_create(int argc, const char **argv) char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); - if (!tail) { - ret = -ENOMEM; - goto fail_address_parse; - } + if (!tail) + return -ENOMEM; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; + buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -686,45 +690,36 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto fail_address_parse; + return ret; } tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = filename; + /* Clear @path so that it will not freed by path_put() */ + memset(&path, 0, sizeof(path)); + tu->filename = no_free_ptr(filename); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context ctx = { - .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, - }; - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); - traceprobe_finish_parse(&ctx); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) - goto error; + return ret; } ptype = is_ret_probe(tu) ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_uprobe(tu); if (!ret) - goto out; - -error: - free_trace_uprobe(tu); -out: - trace_probe_log_clear(); - return ret; - -fail_address_parse: - trace_probe_log_clear(); - path_put(&path); - kfree(filename); + tu = NULL; return ret; } @@ -741,7 +736,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } @@ -1489,7 +1484,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -1534,6 +1529,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); @@ -1548,11 +1544,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); @@ -1566,6 +1563,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; tu = container_of(con, struct trace_uprobe, consumer); @@ -1577,11 +1575,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 1921ade45be3..7f8da4dab69d 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1076,7 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; - entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); + entries = vmalloc_array(map->max_elts, sizeof(sort_entry)); if (!entries) return -ENOMEM; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 16b283f9d831..6ea2f6363b90 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); + stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); - stats->ac_ppid = pid_alive(tsk) ? 
- task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); diff --git a/kernel/ucount.c b/kernel/ucount.c index 8686e329b8f2..586af49fc03e 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { - long c, old; - c = atomic_long_read(v); - for (;;) { + long c = atomic_long_read(v); + + do { if (unlikely(c >= u)) return false; - old = atomic_long_cmpxchg(v, c, c+1); - if (likely(old == c)) - return true; - c = old; - } + } while (!atomic_long_try_cmpxchg(v, &c, c+1)); + + return true; } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile new file mode 100644 index 000000000000..eae37bea54fd --- /dev/null +++ b/kernel/unwind/Makefile @@ -0,0 +1 @@ + obj-$(CONFIG_UNWIND_USER) += user.o deferred.o diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c new file mode 100644 index 000000000000..a88fb481c4a3 --- /dev/null +++ b/kernel/unwind/deferred.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Deferred user space unwinding + */ +#include <linux/sched/task_stack.h> +#include <linux/unwind_deferred.h> +#include <linux/sched/clock.h> +#include <linux/task_work.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/mm.h> + +/* + * For requesting a deferred user space stack trace from NMI context + * the architecture must support a safe cmpxchg in NMI context. + * For those architectures that do not have that, then it cannot ask + * for a deferred user space stack trace from an NMI context. If it + * does, then it will get -EINVAL. + */ +#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) +# define CAN_USE_IN_NMI 1 +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + u32 old = 0; + + return try_cmpxchg(&info->id.cnt, &old, cnt); +} +#else +# define CAN_USE_IN_NMI 0 +/* When NMIs are not allowed, this always succeeds */ +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + info->id.cnt = cnt; + return true; +} +#endif + +/* Make the cache fit in a 4K page */ +#define UNWIND_MAX_ENTRIES \ + ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) + +/* Guards adding to or removing from the list of callbacks */ +static DEFINE_MUTEX(callback_mutex); +static LIST_HEAD(callbacks); + +#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) + +/* Zero'd bits are available for assigning callback users */ +static unsigned long unwind_mask = RESERVED_BITS; +DEFINE_STATIC_SRCU(unwind_srcu); + +static inline bool unwind_pending(struct unwind_task_info *info) +{ + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; +} + +/* + * This is a unique percpu identifier for a given task entry context. + * Conceptually, it's incremented every time the CPU enters the kernel from + * user space, so that each "entry context" on the CPU gets a unique ID. In + * reality, as an optimization, it's only incremented on demand for the first + * deferred unwind request after a given entry-from-user. + * + * It's combined with the CPU id to make a systemwide-unique "context cookie". + */ +static DEFINE_PER_CPU(u32, unwind_ctx_ctr); + +/* + * The context cookie is a unique identifier that is assigned to a user + * space stacktrace. 
As the user space stacktrace remains the same while + * the task is in the kernel, the cookie is an identifier for the stacktrace. + * Although it is possible for the stacktrace to get another cookie if another + * request is made after the cookie was cleared and before reentering user + * space. + */ +static u64 get_cookie(struct unwind_task_info *info) +{ + u32 cnt = 1; + + lockdep_assert_irqs_disabled(); + + if (info->id.cpu) + return info->id.id; + + /* LSB is always set to ensure 0 is an invalid value */ + cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; + if (try_assign_cnt(info, cnt)) { + /* Update the per cpu counter */ + __this_cpu_write(unwind_ctx_ctr, cnt); + } + /* Interrupts are disabled, the CPU will always be same */ + info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ + + return info->id.id; +} + +/** + * unwind_user_faultable - Produce a user stacktrace in faultable context + * @trace: The descriptor that will store the user stacktrace + * + * This must be called in a known faultable context (usually when entering + * or exiting user space). Depending on the available implementations + * the @trace will be loaded with the addresses of the user space stacktrace + * if it can be found. + * + * Return: 0 on success and negative on error + * On success @trace will contain the user space stacktrace + */ +int unwind_user_faultable(struct unwind_stacktrace *trace) +{ + struct unwind_task_info *info = ¤t->unwind_info; + struct unwind_cache *cache; + + /* Should always be called from faultable context */ + might_fault(); + + if (!current->mm) + return -EINVAL; + + if (!info->cache) { + info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES), + GFP_KERNEL); + if (!info->cache) + return -ENOMEM; + } + + cache = info->cache; + trace->entries = cache->entries; + trace->nr = cache->nr_entries; + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + if (trace->nr) + return 0; + + unwind_user(trace, UNWIND_MAX_ENTRIES); + + cache->nr_entries = trace->nr; + + /* Clear nr_entries on way back to user space */ + atomic_long_or(UNWIND_USED, &info->unwind_mask); + + return 0; +} + +static void process_unwind_deferred(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + struct unwind_stacktrace trace; + struct unwind_work *work; + unsigned long bits; + u64 cookie; + + if (WARN_ON_ONCE(!unwind_pending(info))) + return; + + /* Clear pending bit but make sure to have the current bits */ + bits = atomic_long_fetch_andnot(UNWIND_PENDING, + &info->unwind_mask); + /* + * From here on out, the callback must always be called, even if it's + * just an empty trace. 
+ */ + trace.nr = 0; + trace.entries = NULL; + + unwind_user_faultable(&trace); + + if (info->cache) + bits &= ~(info->cache->unwind_completed); + + cookie = info->id.id; + + guard(srcu)(&unwind_srcu); + list_for_each_entry_srcu(work, &callbacks, list, + srcu_read_lock_held(&unwind_srcu)) { + if (test_bit(work->bit, &bits)) { + work->func(work, &trace, cookie); + if (info->cache) + info->cache->unwind_completed |= BIT(work->bit); + } + } +} + +static void unwind_deferred_task_work(struct callback_head *head) +{ + process_unwind_deferred(current); +} + +void unwind_deferred_task_exit(struct task_struct *task) +{ + struct unwind_task_info *info = ¤t->unwind_info; + + if (!unwind_pending(info)) + return; + + process_unwind_deferred(task); + + task_work_cancel(task, &info->work); +} + +/** + * unwind_deferred_request - Request a user stacktrace on task kernel exit + * @work: Unwind descriptor requesting the trace + * @cookie: The cookie of the first request made for this task + * + * Schedule a user space unwind to be done in task work before exiting the + * kernel. + * + * The returned @cookie output is the generated cookie of the very first + * request for a user space stacktrace for this task since it entered the + * kernel. It can be from a request by any caller of this infrastructure. + * Its value will also be passed to the callback function. It can be + * used to stitch kernel and user stack traces together in post-processing. + * + * It's valid to call this function multiple times for the same @work within + * the same task entry context. Each call will return the same cookie + * while the task hasn't left the kernel. If the callback is not pending + * because it has already been previously called for the same entry context, + * it will be called again with the same stack trace and cookie. + * + * Return: 0 if the callback successfully was queued. + * 1 if the callback is pending or was already executed. + * Negative if there's an error. + * @cookie holds the cookie of the first request by any user + */ +int unwind_deferred_request(struct unwind_work *work, u64 *cookie) +{ + struct unwind_task_info *info = ¤t->unwind_info; + int twa_mode = TWA_RESUME; + unsigned long old, bits; + unsigned long bit; + int ret; + + *cookie = 0; + + if ((current->flags & (PF_KTHREAD | PF_EXITING)) || + !user_mode(task_pt_regs(current))) + return -EINVAL; + + /* + * NMI requires having safe cmpxchg operations. + * Trigger a warning to make it obvious that an architecture + * is using this in NMI when it should not be. + */ + if (in_nmi()) { + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) + return -EINVAL; + twa_mode = TWA_NMI_CURRENT; + } + + /* Do not allow cancelled works to request again */ + bit = READ_ONCE(work->bit); + if (WARN_ON_ONCE(bit < 0)) + return -EINVAL; + + /* Only need the mask now */ + bit = BIT(bit); + + guard(irqsave)(); + + *cookie = get_cookie(info); + + old = atomic_long_read(&info->unwind_mask); + + /* Is this already queued or executed */ + if (old & bit) + return 1; + + /* + * This work's bit hasn't been set yet. Now set it with the PENDING + * bit and fetch the current value of unwind_mask. If ether the + * work's bit or PENDING was already set, then this is already queued + * to have a callback. + */ + bits = UNWIND_PENDING | bit; + old = atomic_long_fetch_or(bits, &info->unwind_mask); + if (old & bits) { + /* + * If the work's bit was set, whatever set it had better + * have also set pending and queued a callback. 
+ */ + WARN_ON_ONCE(!(old & UNWIND_PENDING)); + return old & bit; + } + + /* The work has been claimed, now schedule it. */ + ret = task_work_add(current, &info->work, twa_mode); + + if (WARN_ON_ONCE(ret)) + atomic_long_set(&info->unwind_mask, 0); + + return ret; +} + +void unwind_deferred_cancel(struct unwind_work *work) +{ + struct task_struct *g, *t; + int bit; + + if (!work) + return; + + bit = work->bit; + + /* No work should be using a reserved bit */ + if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS)) + return; + + guard(mutex)(&callback_mutex); + list_del_rcu(&work->list); + + /* Do not allow any more requests and prevent callbacks */ + work->bit = -1; + + __clear_bit(bit, &unwind_mask); + + synchronize_srcu(&unwind_srcu); + + guard(rcu)(); + /* Clear this bit from all threads */ + for_each_process_thread(g, t) { + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); + if (t->unwind_info.cache) + clear_bit(bit, &t->unwind_info.cache->unwind_completed); + } +} + +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ + memset(work, 0, sizeof(*work)); + + guard(mutex)(&callback_mutex); + + /* See if there's a bit in the mask available */ + if (unwind_mask == ~0UL) + return -EBUSY; + + work->bit = ffz(unwind_mask); + __set_bit(work->bit, &unwind_mask); + + list_add_rcu(&work->list, &callbacks); + work->func = func; + return 0; +} + +void unwind_task_init(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + + memset(info, 0, sizeof(*info)); + init_task_work(&info->work, unwind_deferred_task_work); + atomic_long_set(&info->unwind_mask, 0); +} + +void unwind_task_free(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + + kfree(info->cache); + task_work_cancel(task, &info->work); +} diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c new file mode 100644 index 000000000000..39e270789444 --- /dev/null +++ b/kernel/unwind/user.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* +* Generic interfaces for unwinding user space +*/ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/unwind_user.h> +#include <linux/uaccess.h> + +#define for_each_user_frame(state) \ + for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) + +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) +{ + unsigned long cfa, fp, ra; + + if (frame->use_fp) { + if (state->fp < state->sp) + return -EINVAL; + cfa = state->fp; + } else { + cfa = state->sp; + } + + /* Get the Canonical Frame Address (CFA) */ + cfa += frame->cfa_off; + + /* stack going in wrong direction? 
*/ + if (cfa <= state->sp) + return -EINVAL; + + /* Make sure that the address is word aligned */ + if (cfa & (state->ws - 1)) + return -EINVAL; + + /* Find the Return Address (RA) */ + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) + return -EINVAL; + + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) + return -EINVAL; + + state->ip = ra; + state->sp = cfa; + if (frame->fp_off) + state->fp = fp; + state->topmost = false; + return 0; +} + +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + +static int unwind_user_next(struct unwind_user_state *state) +{ + unsigned long iter_mask = state->available_types; + unsigned int bit; + + if (state->done) + return -EINVAL; + + for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) { + enum unwind_user_type type = BIT(bit); + + state->current_type = type; + switch (type) { + case UNWIND_USER_TYPE_FP: + if (!unwind_user_next_fp(state)) + return 0; + continue; + default: + WARN_ONCE(1, "Undefined unwind bit %d", bit); + break; + } + break; + } + + /* No successful unwind method. */ + state->current_type = UNWIND_USER_TYPE_NONE; + state->done = true; + return -EINVAL; +} + +static int unwind_user_start(struct unwind_user_state *state) +{ + struct pt_regs *regs = task_pt_regs(current); + + memset(state, 0, sizeof(*state)); + + if ((current->flags & PF_KTHREAD) || !user_mode(regs)) { + state->done = true; + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP)) + state->available_types |= UNWIND_USER_TYPE_FP; + + state->ip = instruction_pointer(regs); + state->sp = user_stack_pointer(regs); + state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } + state->topmost = true; + + return 0; +} + +int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries) +{ + struct unwind_user_state state; + + trace->nr = 0; + + if (!max_entries) + return -EINVAL; + + if (current->flags & PF_KTHREAD) + return 0; + + for_each_user_frame(&state) { + trace->entries[trace->nr++] = state.ip; + if (trace->nr >= max_entries) + break; + } + + return 0; +} diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? 
*/ struct user_namespace init_user_ns = { + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { @@ -65,13 +66,8 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, -#ifdef CONFIG_USER_NS - .ns.ops = &userns_operations, -#endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -124,12 +125,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(ns); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -159,12 +159,13 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
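The user namespace hunks above (and the uts namespace hunks further down) all apply the same conversion; condensed, the new lifecycle looks roughly like the sketch below for a hypothetical "foo" namespace, with the helper semantics inferred from this diff rather than from their definitions.

static struct foo_namespace *create_foo_ns(void)
{
	struct foo_namespace *ns;
	int err;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return ERR_PTR(-ENOMEM);

	err = ns_common_init(ns);	/* replaces ns_alloc_inum() plus ops/refcount setup */
	if (err) {
		kfree(ns);
		return ERR_PTR(err);
	}

	/* ... type-specific initialisation ... */

	ns_tree_add(ns);		/* make the namespace visible to nstree walkers */
	return ns;
}

static void free_foo_ns(struct foo_namespace *ns)
{
	ns_tree_remove(ns);
	/* ... type-specific teardown ... */
	ns_common_free(ns);		/* replaces ns_free_inum() */
	kfree_rcu(ns, ns.ns_rcu);	/* concurrent nstree traversal relies on a grace period */
}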
*/ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) @@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, @@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c deleted file mode 100644 index 8303f4c7ca71..000000000000 --- a/kernel/usermode_driver.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * umd - User mode driver support - */ -#include <linux/shmem_fs.h> -#include <linux/pipe_fs_i.h> -#include <linux/mount.h> -#include <linux/fs_struct.h> -#include <linux/task_work.h> -#include <linux/usermode_driver.h> - -static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) -{ - struct file_system_type *type; - struct vfsmount *mnt; - struct file *file; - ssize_t written; - loff_t pos = 0; - - type = get_fs_type("tmpfs"); - if (!type) - return ERR_PTR(-ENODEV); - - mnt = kern_mount(type); - put_filesystem(type); - if (IS_ERR(mnt)) - return mnt; - - file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); - if (IS_ERR(file)) { - kern_unmount(mnt); - return ERR_CAST(file); - } - - written = kernel_write(file, data, len, &pos); - if (written != len) { - int err = written; - if (err >= 0) - err = -ENOMEM; - filp_close(file, NULL); - kern_unmount(mnt); - return ERR_PTR(err); - } - - fput(file); - - /* Flush delayed fput so exec can open the file read-only */ - flush_delayed_fput(); - task_work_run(); - return mnt; -} - -/** - * umd_load_blob - Remember a blob of bytes for fork_usermode_driver - * @info: information about usermode driver - * @data: a blob of bytes that can be executed as a file - * @len: The lentgh of the blob - * - */ -int umd_load_blob(struct umd_info *info, const void *data, size_t len) -{ - struct vfsmount *mnt; - - if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) - return -EBUSY; - - mnt = blob_to_mnt(data, len, info->driver_name); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - info->wd.mnt = mnt; - info->wd.dentry = mnt->mnt_root; - return 0; -} -EXPORT_SYMBOL_GPL(umd_load_blob); - -/** - * umd_unload_blob - Disassociate @info from a previously loaded blob - * @info: information about usermode driver - * - */ -int umd_unload_blob(struct umd_info *info) -{ - if (WARN_ON_ONCE(!info->wd.mnt || - !info->wd.dentry || - info->wd.mnt->mnt_root != info->wd.dentry)) - return -EINVAL; - - kern_unmount(info->wd.mnt); - info->wd.mnt = NULL; - info->wd.dentry = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(umd_unload_blob); - -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umd_info *umd_info = info->data; - struct file 
*from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - set_fs_pwd(current->fs, &umd_info->wd); - umd_info->pipe_to_umh = to_umh[1]; - umd_info->pipe_from_umh = from_umh[0]; - umd_info->tgid = get_pid(task_tgid(current)); - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umd_info *umd_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) - umd_cleanup_helper(umd_info); -} - -/** - * umd_cleanup_helper - release the resources which were allocated in umd_setup - * @info: information about usermode driver - */ -void umd_cleanup_helper(struct umd_info *info) -{ - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - put_pid(info->tgid); - info->tgid = NULL; -} -EXPORT_SYMBOL_GPL(umd_cleanup_helper); - -/** - * fork_usermode_driver - fork a usermode driver - * @info: information about usermode driver (shouldn't be NULL) - * - * Returns either negative error or zero which indicates success in - * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a tgid of the process. The caller is - * responsible for health check of the user process, killing it via - * tgid, and closing the pipes when user process is no longer needed. 
- */ -int fork_usermode_driver(struct umd_info *info) -{ - struct subprocess_info *sub_info; - const char *argv[] = { info->driver_name, NULL }; - int err; - - if (WARN_ON_ONCE(info->tgid)) - return -EBUSY; - - err = -ENOMEM; - sub_info = call_usermodehelper_setup(info->driver_name, - (char **)argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); -out: - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_driver); - - diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..ebbfc578a9d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> +#include <linux/nstree.h> #include <linux/sched/task.h> static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -86,7 +76,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; @@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); -} - -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *utsns_get(struct task_struct *task) @@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, @@ -174,4 +160,5 @@ void __init uts_ns_init(void) offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); + ns_tree_add(&init_uts_ns); } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2f844c279a3e..27107dcc1cbf 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk) * freeing it below. 
*/ wait_for_completion(&vtsk->exited); + put_task_struct(vtsk->task); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); @@ -145,10 +146,10 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return ERR_PTR(PTR_ERR(tsk)); + return ERR_CAST(tsk); } - vtsk->task = tsk; + vtsk->task = get_task_struct(tsk); return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1fec61603ef3..fe9bf8db1922 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -31,6 +31,13 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +struct hwerr_info { + atomic_t count; + time64_t timestamp; +}; + +static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { @@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +void hwerr_log_error_type(enum hwerr_error_type src) +{ + if (src < 0 || src >= HWERR_RECOV_MAX) + return; + + atomic_inc(&hwerr_data[src].count); + WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); +} +EXPORT_SYMBOL_GPL(hwerr_log_error_type); + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -210,6 +227,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#ifdef CONFIG_UNACCEPTED_MEMORY +#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) + VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); +#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9fa2af9dbf2c..0685e3a8aa0a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> +#include <linux/sys_info.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> @@ -47,6 +48,7 @@ int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; @@ -63,6 +65,36 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); + +/* + * bitmasks to control what kinds of system info to be printed when + * hard lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. 
+ */ +static unsigned long hardlockup_si_mask; + +#ifdef CONFIG_SYSFS + +static unsigned int hardlockup_count; + +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", hardlockup_count); +} + +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); + +static __init int kernel_hardlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_hardlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -154,11 +186,15 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + int hardlockup_all_cpu_backtrace; + if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } + hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_hardlockup_all_cpu_backtrace; /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have @@ -169,6 +205,19 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) unsigned int this_cpu = smp_processor_id(); unsigned long flags; +#ifdef CONFIG_SYSFS + ++hardlockup_count; +#endif + /* + * A poorly behaving BPF scheduler can trigger hard lockup by + * e.g. putting numerous affinitized tasks in a single queue and + * directing all CPUs at it. The following call can return true + * only once when sched_ext is enabled and will immediately + * abort the BPF scheduler and print out a warning message. + */ + if (scx_hardlockup(cpu)) + return; + /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; @@ -177,7 +226,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } @@ -206,12 +255,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -302,6 +352,13 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +/* + * bitmasks to control what kinds of system info to be printed when + * soft lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. 
+ */ +static unsigned long softlockup_si_mask; + static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ @@ -311,6 +368,28 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +#ifdef CONFIG_SYSFS + +static unsigned int softlockup_count; + +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", softlockup_count); +} + +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); + +static __init int kernel_softlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_softlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ @@ -375,7 +454,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail); */ static u16 get_16bit_precision(u64 data_ns) { - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ + /* + * 2^24ns ~= 16.8ms + * Round to the nearest multiple of 16.8 milliseconds. + */ + return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) @@ -394,6 +477,14 @@ static void update_cpustat(void) old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + /* + * Since we use 16-bit precision, the raw data will undergo + * integer division, which may sometimes result in data loss, + * and then result might exceed 100%. To avoid confusion, + * we enforce a 100% display cap when calculations exceed this threshold. + */ + if (util > 100) + util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } @@ -405,17 +496,17 @@ static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); - u64 sample_period_second = sample_period; + u64 sample_period_msecond = sample_period; - do_div(sample_period_second, NSEC_PER_SEC); + do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", - smp_processor_id(), sample_period_second); + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", + smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; @@ -684,12 +775,21 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; - int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + int softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; + /* + * pass the buddy check if a panic is in process + */ + if (panic_in_progress()) + return HRTIMER_NORESTART; + + softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? 
+ 1 : sysctl_softlockup_all_cpu_backtrace; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ @@ -742,6 +842,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { +#ifdef CONFIG_SYSFS + ++softlockup_count; +#endif + /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. @@ -774,6 +878,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); + sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); if (softlockup_panic) panic("softlockup: hung tasks"); } @@ -870,12 +975,20 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); + /* + * To prevent watchdog_timer_fn from using the old interval and + * the new watchdog_thresh at the same time, which could lead to + * false softlockup reports, it is necessary to update the + * watchdog_thresh after the softlockup is completed. + */ + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) @@ -888,7 +1001,7 @@ static void __lockup_detector_reconfigure(void) void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } @@ -908,27 +1021,29 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -946,11 +1061,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ -static void proc_watchdog_update(void) +static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. 
*/ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(thresh_changed); } /* @@ -984,7 +1099,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) - proc_watchdog_update(); + proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; @@ -1035,11 +1150,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); - old = READ_ONCE(watchdog_thresh); + watchdog_thresh_next = READ_ONCE(watchdog_thresh); + + old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!err && write && old != READ_ONCE(watchdog_thresh)) - proc_watchdog_update(); + if (!err && write && old != READ_ONCE(watchdog_thresh_next)) + proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; @@ -1060,7 +1177,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_update(); + proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; @@ -1080,7 +1197,7 @@ static const struct ctl_table watchdog_sysctls[] = { }, { .procname = "watchdog_thresh", - .data = &watchdog_thresh, + .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, @@ -1113,6 +1230,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "softlockup_sys_info", + .data = &softlockup_si_mask, + .maxlen = sizeof(softlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -1135,6 +1259,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "hardlockup_sys_info", + .data = &hardlockup_si_mask, + .maxlen = sizeof(hardlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", @@ -1147,14 +1278,11 @@ static const struct ctl_table watchdog_sysctls[] = { }, #endif /* CONFIG_SMP */ #endif -}; - -static struct ctl_table watchdog_hardlockup_sysctl[] = { { .procname = "nmi_watchdog", .data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), - .mode = 0444, + .mode = 0644, .proc_handler = proc_nmi_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1164,10 +1292,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = { static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); - - if (watchdog_hardlockup_available) - watchdog_hardlockup_sysctl[0].mode = 0644; - register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } #else diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index 34dbfe091f4b..ee754d767c21 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu) { unsigned int next_cpu; - next_cpu = cpumask_next(cpu, &watchdog_cpus); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(&watchdog_cpus); - + next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus); if (next_cpu == cpu) return nr_cpu_ids; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 75af12ff774e..d3ca70e3c256 100644 
--- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt +#include <linux/panic.h> #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/module.h> @@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (panic_in_progress()) + return; + if (!watchdog_check_timestamp()) return; @@ -187,6 +191,28 @@ void watchdog_hardlockup_disable(unsigned int cpu) } /** + * hardlockup_detector_perf_adjust_period - Adjust the event period due + * to current cpu frequency change + * @period: The target period to be set + */ +void hardlockup_detector_perf_adjust_period(u64 period) +{ + struct perf_event *event = this_cpu_read(watchdog_ev); + + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) + return; + + if (!event) + return; + + if (event->attr.sample_period == period) + return; + + if (perf_event_period(event, period)) + pr_err("failed to change period to %llu\n", period); +} + +/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cf6203282737..253311af47c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -222,7 +222,9 @@ struct worker_pool { struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - +#ifdef CONFIG_PREEMPT_RT + spinlock_t cb_lock; /* BH worker cancel lock */ +#endif /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). @@ -505,12 +507,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init; struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); +struct workqueue_struct *system_percpu_wq __ro_after_init; +EXPORT_SYMBOL(system_percpu_wq); struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_dfl_wq __ro_after_init; +EXPORT_SYMBOL_GPL(system_dfl_wq); struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); struct workqueue_struct *system_power_efficient_wq __ro_after_init; @@ -535,12 +541,6 @@ static void show_one_worker_pool(struct worker_pool *pool); !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") -#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ - !lockdep_is_held(&wq->mutex) && \ - !lockdep_is_held(&wq_pool_mutex), \ - "RCU, wq->mutex or wq_pool_mutex should be held") - #define for_each_bh_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { - destroy_timer_on_stack(&work->timer); + timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); @@ -1686,17 +1686,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = 
READ_ONCE(nna->max); + int old = atomic_read(&nna->nr); - while (true) { - int old, tmp; - - old = atomic_read(&nna->nr); + do { if (old >= max) return false; - tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); - if (tmp == old) - return true; - } + } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); + + return true; } /** @@ -2221,12 +2218,9 @@ static int wq_select_unbound_cpu(int cpu) } new_cpu = __this_cpu_read(wq_rr_cpu_last); - new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); - if (unlikely(new_cpu >= nr_cpu_ids)) { - new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); - if (unlikely(new_cpu >= nr_cpu_ids)) - return cpu; - } + new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; @@ -2481,7 +2475,7 @@ EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = from_timer(dwork, t, timer); + struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -2909,7 +2903,7 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) */ static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, idle_timer); + struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) @@ -2932,7 +2926,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** @@ -3008,7 +3002,7 @@ static void send_mayday(struct work_struct *work) static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, mayday_timer); + struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); @@ -3080,6 +3074,31 @@ restart: goto restart; } +#ifdef CONFIG_PREEMPT_RT +static void worker_lock_callback(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); +} + +static void worker_unlock_callback(struct worker_pool *pool) +{ + spin_unlock(&pool->cb_lock); +} + +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); + spin_unlock(&pool->cb_lock); +} + +#else + +static void worker_lock_callback(struct worker_pool *pool) { } +static void worker_unlock_callback(struct worker_pool *pool) { } +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } + +#endif + /** * manage_workers - manage worker pool * @worker: self @@ -3241,7 +3260,7 @@ __acquires(&pool->lock) * point will only record its address. 
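The tryinc_node_nr_active() rework above is the standard atomic_try_cmpxchg() idiom: on failure the helper refreshes the expected value, so the explicit re-read of the old loop disappears. A stand-alone sketch of the same pattern (bounded_inc() is illustrative):

#include <linux/atomic.h>

/* Increment @v unless it already reached @max. */
static bool bounded_inc(atomic_t *v, int max)
{
	int old = atomic_read(v);

	do {
		if (old >= max)
			return false;
		/* on failure, atomic_try_cmpxchg() updates @old with the current value */
	} while (!atomic_try_cmpxchg(v, &old, old + 1));

	return true;
}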
*/ trace_workqueue_execute_end(work, worker->current_func); - pwq->stats[PWQ_STAT_COMPLETED]++; + lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); @@ -3272,6 +3291,8 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); + pwq->stats[PWQ_STAT_COMPLETED]++; + /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than @@ -3416,6 +3437,27 @@ sleep: goto woke_up; } +static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work, *n; + + /* need rescue? */ + if (!pwq->nr_active || !need_to_create_worker(pool)) + return false; + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) + pwq->stats[PWQ_STAT_RESCUED]++; + } + + return !list_empty(&rescuer->scheduled); +} + /** * rescuer_thread - the rescuer thread function * @__rescuer: self @@ -3470,7 +3512,6 @@ repeat: struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; - struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -3481,18 +3522,9 @@ repeat: raw_spin_lock_irq(&pool->lock); - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq && - assign_work(work, rescuer, &n)) - pwq->stats[PWQ_STAT_RESCUED]++; - } - if (!list_empty(&rescuer->scheduled)) { + if (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* @@ -3507,10 +3539,9 @@ repeat: if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* - * Queue iff we aren't racing destruction - * and somebody else hasn't queued it already. + * Queue iff somebody else hasn't queued it already. */ - if (wq->rescuer && list_empty(&pwq->mayday_node)) { + if (list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } @@ -3557,6 +3588,7 @@ static void bh_worker(struct worker *worker) int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; + worker_lock_callback(pool); raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); @@ -3585,6 +3617,7 @@ done: worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); + worker_unlock_callback(pool); } /* @@ -4222,17 +4255,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted - * soft interrupt processing or prevents ksoftirqd from - * running by keeping flipping BH. If the BH work item - * runs on a different CPU then this has no effect other - * than doing the BH disable/enable dance for nothing. - * This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). + * soft interrupt processing by blocking on lock which + * is owned by the thread invoking the callback. 
*/ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); + struct worker_pool *pool; + + guard(rcu)(); + pool = get_work_pool(work); + if (pool) + workqueue_callback_cancel_wait_running(pool); } else { cpu_relax(); } @@ -4627,7 +4660,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -struct workqueue_attrs *alloc_workqueue_attrs(void) +struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) { struct workqueue_attrs *attrs; @@ -4782,6 +4815,9 @@ static int init_worker_pool(struct worker_pool *pool) ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; +#ifdef CONFIG_PREEMPT_RT + spin_lock_init(&pool->cb_lock); +#endif /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); @@ -5344,11 +5380,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); - /* rescuer needs to respect wq cpumask changes */ - if (ctx->wq->rescuer) - set_cpus_allowed_ptr(ctx->wq->rescuer->task, - unbound_effective_cpumask(ctx->wq)); - mutex_unlock(&ctx->wq->mutex); } @@ -5582,10 +5613,13 @@ static int init_rescuer(struct workqueue_struct *wq) } wq->rescuer = rescuer; - if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); + + /* initial cpumask is consistent with the detached rescuer and unbind_worker() */ + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + kthread_bind_mask(rescuer->task, wq_unbound_cpumask); else kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); return 0; @@ -5680,12 +5714,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, else wq_size = sizeof(*wq); - wq = kzalloc(wq_size, GFP_KERNEL); + wq = kzalloc_noprof(wq_size, GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { - wq->unbound_attrs = alloc_workqueue_attrs(); + wq->unbound_attrs = alloc_workqueue_attrs_noprof(); if (!wq->unbound_attrs) goto err_free_wq; } @@ -5775,9 +5809,9 @@ err_destroy: } __printf(1, 4) -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...) +struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, + unsigned int flags, + int max_active, ...) { struct workqueue_struct *wq; va_list args; @@ -5792,7 +5826,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; } -EXPORT_SYMBOL_GPL(alloc_workqueue); +EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) @@ -5837,6 +5871,17 @@ static bool pwq_busy(struct pool_workqueue *pwq) * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. + * + * This function does NOT guarantee that non-pending work that has been + * submitted with queue_delayed_work() and similar functions will be done + * before destroying the workqueue. The fundamental problem is that, currently, + * the workqueue has no way of accessing non-pending delayed_work. delayed_work + * is only linked on the timer-side. All delayed_work must, therefore, be + * canceled before calling this function. + * + * TODO: It would be better if the problem described above wouldn't exist and + * destroy_workqueue() would cleanly cancel all pending and non-pending + * delayed_work. 
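In practice, the ordering that the destroy_workqueue() comment above asks for looks like the sketch below; my_wq and my_dwork are illustrative names.

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void my_teardown(void)
{
	/* Cancel delayed work explicitly: it may still be armed only on its timer. */
	cancel_delayed_work_sync(&my_dwork);
	/* Only then is it safe to drain and destroy the workqueue itself. */
	destroy_workqueue(my_wq);
}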
*/ void destroy_workqueue(struct workqueue_struct *wq) { @@ -5859,16 +5904,10 @@ void destroy_workqueue(struct workqueue_struct *wq) /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { - struct worker *rescuer = wq->rescuer; - - /* this prevents new queueing */ - raw_spin_lock_irq(&wq_mayday_lock); - wq->rescuer = NULL; - raw_spin_unlock_irq(&wq_mayday_lock); - /* rescuer will empty maydays list before exiting */ - kthread_stop(rescuer->task); - kfree(rescuer); + kthread_stop(wq->rescuer->task); + kfree(wq->rescuer); + wq->rescuer = NULL; } /* @@ -6035,7 +6074,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) @@ -6045,7 +6083,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) ret = !list_empty(&pwq->inactive_works); preempt_enable(); - rcu_read_unlock(); return ret; } @@ -6757,31 +6794,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *), return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu_key); - -/** - * work_on_cpu_safe_key - run a function in thread context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function argument - * @key: The lock class key for lock debugging purposes - * - * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold - * any locks which would prevent @fn from completing. - * - * Return: The value @fn returns. - */ -long work_on_cpu_safe_key(int cpu, long (*fn)(void *), - void *arg, struct lock_class_key *key) -{ - long ret = -ENODEV; - - cpus_read_lock(); - if (cpu_online(cpu)) - ret = work_on_cpu_key(cpu, fn, arg, key); - cpus_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER @@ -6921,8 +6933,26 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) } if (!ret) { + int cpu; + struct worker_pool *pool; + struct worker *worker; + mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); + /* rescuer needs to respect cpumask changes when it is not attached */ + list_for_each_entry(wq, &workqueues, list) { + if (wq->rescuer && !wq->rescuer->pool) + unbind_worker(wq->rescuer); + } + /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */ + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + if (!(pool->flags & POOL_DISASSOCIATED)) + continue; + for_each_pool_worker(worker, pool) + unbind_worker(worker); + } + } mutex_unlock(&wq_pool_attach_mutex); } return ret; @@ -7560,8 +7590,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (!thresh) return; - rcu_read_lock(); - for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; @@ -7603,8 +7631,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) } - rcu_read_unlock(); - if (lockup_detected) show_all_workqueues(); @@ -7656,7 +7682,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; @@ -7754,7 +7780,8 @@ void __init workqueue_init_early(void) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); - + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); 
@@ -7815,23 +7842,24 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); - system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_MAX_ACTIVE); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); + system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); - BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || - !system_unbound_wq || !system_freezable_wq || + WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); + BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || + !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || !system_bh_wq || !system_bh_highpri_wq); |
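A short sketch of how callers would pick between the default workqueues set up above; the work items and the wrapper function are illustrative.

static struct work_struct percpu_work;	/* work that relies on running on the queueing CPU */
static struct work_struct unbound_work;	/* work with no placement requirement */

static void queue_examples(void)
{
	queue_work(system_percpu_wq, &percpu_work);	/* per-CPU default queue */
	queue_work(system_dfl_wq, &unbound_work);	/* unbound default queue */
}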
