Diffstat (limited to 'arch/x86/kernel/alternative.c')
-rw-r--r-- | arch/x86/kernel/alternative.c | 441
1 file changed, 179 insertions(+), 262 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45bcff181cba..ecfe7b497cad 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,39 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #define pr_fmt(fmt) "SMP alternatives: " fmt
 
-#include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/mmu_context.h>
 #include <linux/perf_event.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/bsearch.h>
-#include <linux/sync_core.h>
 #include <linux/execmem.h>
+
 #include <asm/text-patching.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
 #include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/paravirt.h>
-#include <asm/asm-prototypes.h>
-#include <asm/cfi.h>
 #include <asm/ibt.h>
 #include <asm/set_memory.h>
+#include <asm/nmi.h>
 
 int __read_mostly alternatives_patched;
 
@@ -339,13 +317,6 @@ static void add_nop(u8 *buf, unsigned int len)
                 *buf = INT3_INSN_OPCODE;
 }
 
-extern s32 __retpoline_sites[], __retpoline_sites_end[];
-extern s32 __return_sites[], __return_sites_end[];
-extern s32 __cfi_sites[], __cfi_sites_end[];
-extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
-
 /*
  * Matches NOP and NOPL, not any of the other possible NOPs.
  */
@@ -537,7 +508,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen,
         }
 }
 
-void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
 {
         __apply_relocation(buf, instr, instrlen, repl, repl_len);
         optimize_nops(instr, buf, instrlen);
@@ -625,7 +596,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
         DPRINTK(ALT, "alt table %px, -> %px", start, end);
 
         /*
-         * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+         * KASAN_SHADOW_START is defined using
          * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
          * During the process, KASAN becomes confused seeing partial LA57
          * conversion and triggers a false-positive out-of-bound report.
@@ -693,7 +664,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
                 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
                         insn_buff[insn_buff_sz] = 0x90;
 
-                apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+                text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
 
                 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
                 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -2305,7 +2276,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg)
 
 static noinline void __init alt_reloc_selftest(void)
 {
         /*
-         * Tests apply_relocation().
+         * Tests text_poke_apply_relocation().
         *
         * This has a relative immediate (CALL) in a place other than the first
         * instruction and additionally on x86_64 we get a RIP-relative LEA:
@@ -2442,76 +2413,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
         }
 }
 
-typedef struct {
-        struct mm_struct *mm;
-} temp_mm_state_t;
-
-/*
- * Using a temporary mm allows to set temporary mappings that are not accessible
- * by other CPUs. Such mappings are needed to perform sensitive memory writes
- * that override the kernel memory protections (e.g., W^X), without exposing the
- * temporary page-table mappings that are required for these write operations to
- * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
- * mapping is torn down.
- *
- * Context: The temporary mm needs to be used exclusively by a single core. To
- *          harden security IRQs must be disabled while the temporary mm is
- *          loaded, thereby preventing interrupt handler bugs from overriding
- *          the kernel memory protection.
- */
-static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
-{
-        temp_mm_state_t temp_state;
-
-        lockdep_assert_irqs_disabled();
-
-        /*
-         * Make sure not to be in TLB lazy mode, as otherwise we'll end up
-         * with a stale address space WITHOUT being in lazy mode after
-         * restoring the previous mm.
-         */
-        if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
-                leave_mm();
-
-        temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-        switch_mm_irqs_off(NULL, mm, current);
-
-        /*
-         * If breakpoints are enabled, disable them while the temporary mm is
-         * used. Userspace might set up watchpoints on addresses that are used
-         * in the temporary mm, which would lead to wrong signals being sent or
-         * crashes.
-         *
-         * Note that breakpoints are not disabled selectively, which also causes
-         * kernel breakpoints (e.g., perf's) to be disabled. This might be
-         * undesirable, but still seems reasonable as the code that runs in the
-         * temporary mm should be short.
-         */
-        if (hw_breakpoint_active())
-                hw_breakpoint_disable();
-
-        return temp_state;
-}
-
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
-static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
-{
-        lockdep_assert_irqs_disabled();
-
-        switch_mm_irqs_off(NULL, prev_state.mm, current);
-
-        /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
-        cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
-
-        /*
-         * Restore the breakpoints if they were disabled before the temporary mm
-         * was loaded.
-         */
-        if (hw_breakpoint_active())
-                hw_breakpoint_restore();
-}
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
 
 static void text_poke_memcpy(void *dst, const void *src, size_t len)
 {
@@ -2531,7 +2434,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
 {
         bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
         struct page *pages[2] = {NULL};
-        temp_mm_state_t prev;
+        struct mm_struct *prev_mm;
         unsigned long flags;
         pte_t pte, *ptep;
         spinlock_t *ptl;
@@ -2568,7 +2471,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
         /*
          * The lock is not really needed, but this allows to avoid open-coding.
          */
-        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+        ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
 
         /*
          * This must not fail; preallocated in poking_init().
@@ -2578,21 +2481,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
         local_irq_save(flags);
 
         pte = mk_pte(pages[0], pgprot);
-        set_pte_at(poking_mm, poking_addr, ptep, pte);
+        set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
 
         if (cross_page_boundary) {
                 pte = mk_pte(pages[1], pgprot);
-                set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+                set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
         }
 
         /*
          * Loading the temporary mm behaves as a compiler barrier, which
          * guarantees that the PTE will be set at the time memcpy() is done.
          */
-        prev = use_temporary_mm(poking_mm);
+        prev_mm = use_temporary_mm(text_poke_mm);
 
         kasan_disable_current();
-        func((u8 *)poking_addr + offset_in_page(addr), src, len);
+        func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
         kasan_enable_current();
 
         /*
@@ -2601,22 +2504,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
          */
         barrier();
 
-        pte_clear(poking_mm, poking_addr, ptep);
+        pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
         if (cross_page_boundary)
-                pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+                pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
 
         /*
          * Loading the previous page-table hierarchy requires a serializing
          * instruction that already allows the core to see the updated version.
          * Xen-PV is assumed to serialize execution in a similar manner.
          */
-        unuse_temporary_mm(prev);
+        unuse_temporary_mm(prev_mm);
 
         /*
          * Flushing the TLB might involve IPIs, which would require enabled
          * IRQs, but not if the mm is not used, as it is in this point.
          */
-        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+        flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
                            (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
                            PAGE_SHIFT, false);
 
@@ -2752,7 +2655,7 @@ static void do_sync_core(void *info)
         sync_core();
 }
 
-void text_poke_sync(void)
+void smp_text_poke_sync_each_cpu(void)
 {
         on_each_cpu(do_sync_core, NULL, 1);
 }
 
@@ -2762,64 +2665,66 @@ void text_poke_sync(void)
  * this thing. When len == 6 everything is prefixed with 0x0f and we map
  * opcode to Jcc.d8, using len to distinguish.
 */
 
-struct text_poke_loc {
+struct smp_text_poke_loc {
         /* addr := _stext + rel_addr */
         s32 rel_addr;
         s32 disp;
         u8 len;
         u8 opcode;
-        const u8 text[POKE_MAX_OPCODE_SIZE];
-        /* see text_poke_bp_batch() */
+        const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+        /* see smp_text_poke_batch_finish() */
         u8 old;
 };
 
-struct bp_patching_desc {
-        struct text_poke_loc *vec;
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+        struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
         int nr_entries;
-        atomic_t refs;
-};
+} text_poke_array;
 
-static struct bp_patching_desc bp_desc;
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
 
-static __always_inline
-struct bp_patching_desc *try_get_desc(void)
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
 {
-        struct bp_patching_desc *desc = &bp_desc;
+        atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
 
-        if (!raw_atomic_inc_not_zero(&desc->refs))
-                return NULL;
+        if (!raw_atomic_inc_not_zero(refs))
+                return false;
 
-        return desc;
+        return true;
 }
 
-static __always_inline void put_desc(void)
+static __always_inline void put_text_poke_array(void)
 {
-        struct bp_patching_desc *desc = &bp_desc;
+        atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
 
         smp_mb__before_atomic();
-        raw_atomic_dec(&desc->refs);
+        raw_atomic_dec(refs);
 }
 
-static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
 {
-        return _stext + tp->rel_addr;
+        return _stext + tpl->rel_addr;
 }
 
-static __always_inline int patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
 {
-        struct text_poke_loc *tp = (struct text_poke_loc *) elt;
-
-        if (key < text_poke_addr(tp))
+        if (tpl_a < text_poke_addr(tpl_b))
                 return -1;
-        if (key > text_poke_addr(tp))
+        if (tpl_a > text_poke_addr(tpl_b))
                 return 1;
 
         return 0;
 }
 
-noinstr int poke_int3_handler(struct pt_regs *regs)
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
 {
-        struct bp_patching_desc *desc;
-        struct text_poke_loc *tp;
+        struct smp_text_poke_loc *tpl;
         int ret = 0;
         void *ip;
 
@@ -2828,41 +2733,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 
         /*
          * Having observed our INT3 instruction, we now must observe
-         * bp_desc with non-zero refcount:
+         * text_poke_array with non-zero refcount:
          *
-         *      bp_desc.refs = 1                INT3
-         *      WMB                             RMB
-         *      write INT3                      if (bp_desc.refs != 0)
+         *      text_poke_array_refs = 1        INT3
+         *      WMB                             RMB
+         *      write INT3                      if (text_poke_array_refs != 0)
          */
         smp_rmb();
 
-        desc = try_get_desc();
-        if (!desc)
+        if (!try_get_text_poke_array())
                 return 0;
 
         /*
-         * Discount the INT3. See text_poke_bp_batch().
+         * Discount the INT3. See smp_text_poke_batch_finish().
          */
         ip = (void *) regs->ip - INT3_INSN_SIZE;
 
         /*
          * Skip the binary search if there is a single member in the vector.
         */
-        if (unlikely(desc->nr_entries > 1)) {
-                tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
-                                      sizeof(struct text_poke_loc),
+        if (unlikely(text_poke_array.nr_entries > 1)) {
+                tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+                                       sizeof(struct smp_text_poke_loc),
                                       patch_cmp);
-                if (!tp)
+                if (!tpl)
                         goto out_put;
         } else {
-                tp = desc->vec;
-                if (text_poke_addr(tp) != ip)
+                tpl = text_poke_array.vec;
+                if (text_poke_addr(tpl) != ip)
                         goto out_put;
         }
 
-        ip += tp->len;
+        ip += tpl->len;
 
-        switch (tp->opcode) {
+        switch (tpl->opcode) {
         case INT3_INSN_OPCODE:
                 /*
                  * Someone poked an explicit INT3, they'll want to handle it,
@@ -2875,16 +2779,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
                 break;
 
         case CALL_INSN_OPCODE:
-                int3_emulate_call(regs, (long)ip + tp->disp);
+                int3_emulate_call(regs, (long)ip + tpl->disp);
                 break;
 
         case JMP32_INSN_OPCODE:
         case JMP8_INSN_OPCODE:
-                int3_emulate_jmp(regs, (long)ip + tp->disp);
+                int3_emulate_jmp(regs, (long)ip + tpl->disp);
                 break;
 
         case 0x70 ... 0x7f: /* Jcc */
-                int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+                int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
                 break;
 
         default:
@@ -2894,51 +2798,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
         ret = 1;
 
 out_put:
-        put_desc();
+        put_text_poke_array();
         return ret;
 }
 
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
-
 /**
- * text_poke_bp_batch() -- update instructions on live kernel on SMP
- * @tp: vector of instructions to patch
- * @nr_entries: number of entries in the vector
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
  *
- * Modify multi-byte instruction by using int3 breakpoint on SMP.
- * We completely avoid stop_machine() here, and achieve the
- * synchronization using int3 breakpoint.
+ * Input state:
+ *  text_poke_array.vec: vector of instructions to patch
+ *  text_poke_array.nr_entries: number of entries in the vector
+ *
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
  *
  * The way it is done:
  *   - For each entry in the vector:
- *       - add a int3 trap to the address that will be patched
- *       - sync cores
+ *       - add an INT3 trap to the address that will be patched
+ *       - SMP sync all CPUs
  *   - For each entry in the vector:
 *        - update all but the first byte of the patched range
- *       - sync cores
+ *       - SMP sync all CPUs
 *    - For each entry in the vector:
- *       - replace the first byte (int3) by the first byte of
+ *       - replace the first byte (INT3) by the first byte of the
 *          replacing opcode
- *       - sync cores
+ *       - SMP sync all CPUs
 */
-static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+void smp_text_poke_batch_finish(void)
 {
         unsigned char int3 = INT3_INSN_OPCODE;
         unsigned int i;
         int do_sync;
 
-        lockdep_assert_held(&text_mutex);
+        if (!text_poke_array.nr_entries)
+                return;
 
-        bp_desc.vec = tp;
-        bp_desc.nr_entries = nr_entries;
+        lockdep_assert_held(&text_mutex);
 
         /*
-         * Corresponds to the implicit memory barrier in try_get_desc() to
-         * ensure reading a non-zero refcount provides up to date bp_desc data.
+         * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+         * ensure reading a non-zero refcount provides up to date text_poke_array data.
         */
-        atomic_set_release(&bp_desc.refs, 1);
+        for_each_possible_cpu(i)
+                atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
 
         /*
          * Function tracing can enable thousands of places that need to be
@@ -2951,33 +2854,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
                 cond_resched();
 
         /*
-         * Corresponding read barrier in int3 notifier for making sure the
-         * nr_entries and handler are correctly ordered wrt. patching.
+         * Corresponding read barrier in INT3 notifier for making sure the
+         * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
         */
         smp_wmb();
 
         /*
-         * First step: add a int3 trap to the address that will be patched.
+         * First step: add a INT3 trap to the address that will be patched.
         */
-        for (i = 0; i < nr_entries; i++) {
-                tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
-                text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+        for (i = 0; i < text_poke_array.nr_entries; i++) {
+                text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+                text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
         }
 
-        text_poke_sync();
+        smp_text_poke_sync_each_cpu();
 
         /*
          * Second step: update all but the first byte of the patched range.
         */
-        for (do_sync = 0, i = 0; i < nr_entries; i++) {
-                u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
-                u8 _new[POKE_MAX_OPCODE_SIZE+1];
-                const u8 *new = tp[i].text;
-                int len = tp[i].len;
+        for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+                u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+                u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+                const u8 *new = text_poke_array.vec[i].text;
+                int len = text_poke_array.vec[i].len;
 
                 if (len - INT3_INSN_SIZE > 0) {
                         memcpy(old + INT3_INSN_SIZE,
-                               text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+                               text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
                                len - INT3_INSN_SIZE);
 
                         if (len == 6) {
@@ -2986,7 +2889,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
                         new = _new;
                 }
 
-                text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+                text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
                           new + INT3_INSN_SIZE,
                           len - INT3_INSN_SIZE);
 
@@ -3017,7 +2920,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
                  * The old instruction is recorded so that the event can be
                  * processed forwards or backwards.
                  */
-                perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
+                perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
         }
 
         if (do_sync) {
@@ -3026,63 +2929,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
                  * not necessary and we'd be safe even without it. But
                  * better safe than sorry (plus there's not only Intel).
                  */
-                text_poke_sync();
+                smp_text_poke_sync_each_cpu();
         }
 
         /*
-         * Third step: replace the first byte (int3) by the first byte of
+         * Third step: replace the first byte (INT3) by the first byte of the
          * replacing opcode.
         */
-        for (do_sync = 0, i = 0; i < nr_entries; i++) {
-                u8 byte = tp[i].text[0];
+        for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+                u8 byte = text_poke_array.vec[i].text[0];
 
-                if (tp[i].len == 6)
+                if (text_poke_array.vec[i].len == 6)
                         byte = 0x0f;
 
                 if (byte == INT3_INSN_OPCODE)
                         continue;
 
-                text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
+                text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
                 do_sync++;
         }
 
         if (do_sync)
-                text_poke_sync();
+                smp_text_poke_sync_each_cpu();
 
         /*
          * Remove and wait for refs to be zero.
+         *
+         * Notably, if after step-3 above the INT3 got removed, then the
+         * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+         * handlers and the below spin-wait will not happen.
+         *
+         * IOW. unless the replacement instruction is INT3, this case goes
+         * unused.
         */
-        if (!atomic_dec_and_test(&bp_desc.refs))
-                atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+        for_each_possible_cpu(i) {
+                atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+                if (unlikely(!atomic_dec_and_test(refs)))
+                        atomic_cond_read_acquire(refs, !VAL);
+        }
+
+        /* They are all completed: */
+        text_poke_array.nr_entries = 0;
 }
 
-static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-                               const void *opcode, size_t len, const void *emulate)
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
 {
+        struct smp_text_poke_loc *tpl;
         struct insn insn;
         int ret, i = 0;
 
+        tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
         if (len == 6)
                 i = 1;
-        memcpy((void *)tp->text, opcode+i, len-i);
+        memcpy((void *)tpl->text, opcode+i, len-i);
 
         if (!emulate)
                 emulate = opcode;
 
         ret = insn_decode_kernel(&insn, emulate);
         BUG_ON(ret < 0);
 
-        tp->rel_addr = addr - (void *)_stext;
-        tp->len = len;
-        tp->opcode = insn.opcode.bytes[0];
+        tpl->rel_addr = addr - (void *)_stext;
+        tpl->len = len;
+        tpl->opcode = insn.opcode.bytes[0];
 
         if (is_jcc32(&insn)) {
                 /*
                  * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
                  */
-                tp->opcode = insn.opcode.bytes[1] - 0x10;
+                tpl->opcode = insn.opcode.bytes[1] - 0x10;
         }
 
-        switch (tp->opcode) {
+        switch (tpl->opcode) {
         case RET_INSN_OPCODE:
         case JMP32_INSN_OPCODE:
         case JMP8_INSN_OPCODE:
@@ -3091,14 +3010,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
                  * next instruction can be padded with INT3.
                  */
                 for (i = insn.length; i < len; i++)
-                        BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+                        BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
 
                 break;
 
         default:
                 BUG_ON(len != insn.length);
         }
 
-        switch (tp->opcode) {
+        switch (tpl->opcode) {
         case INT3_INSN_OPCODE:
         case RET_INSN_OPCODE:
                 break;
@@ -3107,21 +3026,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
         case JMP32_INSN_OPCODE:
         case JMP8_INSN_OPCODE:
         case 0x70 ... 0x7f: /* Jcc */
-                tp->disp = insn.immediate.value;
+                tpl->disp = insn.immediate.value;
                 break;
 
         default: /* assume NOP */
                 switch (len) {
                 case 2: /* NOP2 -- emulate as JMP8+0 */
                         BUG_ON(memcmp(emulate, x86_nops[len], len));
-                        tp->opcode = JMP8_INSN_OPCODE;
-                        tp->disp = 0;
+                        tpl->opcode = JMP8_INSN_OPCODE;
+                        tpl->disp = 0;
                         break;
 
                 case 5: /* NOP5 -- emulate as JMP32+0 */
                         BUG_ON(memcmp(emulate, x86_nops[len], len));
-                        tp->opcode = JMP32_INSN_OPCODE;
-                        tp->disp = 0;
+                        tpl->opcode = JMP32_INSN_OPCODE;
+                        tpl->disp = 0;
                         break;
 
                 default: /* unknown instruction */
@@ -3132,51 +3051,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
         }
 }
 
 /*
- * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
  * early if needed.
 */
-static bool tp_order_fail(void *addr)
+static bool text_poke_addr_ordered(void *addr)
 {
-        struct text_poke_loc *tp;
-
-        if (!tp_vec_nr)
-                return false;
+        WARN_ON_ONCE(!addr);
 
-        if (!addr) /* force */
+        if (!text_poke_array.nr_entries)
                 return true;
 
-        tp = &tp_vec[tp_vec_nr - 1];
-        if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
-                return true;
-
-        return false;
-}
-
-static void text_poke_flush(void *addr)
-{
-        if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
-                text_poke_bp_batch(tp_vec, tp_vec_nr);
-                tp_vec_nr = 0;
-        }
-}
+        /*
+         * If the last current entry's address is higher than the
+         * new entry's address we'd like to add, then ordering
+         * is violated and we must first flush all pending patching
+         * requests:
+         */
+        if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+                return false;
 
-void text_poke_finish(void)
-{
-        text_poke_flush(NULL);
+        return true;
 }
 
-void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
 {
-        struct text_poke_loc *tp;
-
-        text_poke_flush(addr);
-
-        tp = &tp_vec[tp_vec_nr++];
-        text_poke_loc_init(tp, addr, opcode, len, emulate);
+        if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+                smp_text_poke_batch_finish();
+        __smp_text_poke_batch_add(addr, opcode, len, emulate);
 }
 
 /**
- * text_poke_bp() -- update instructions on live kernel on SMP
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
  * @addr: address to patch
  * @opcode: opcode of new instruction
  * @len: length to copy
@@ -3184,12 +3102,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
  *
  * Update a single instruction with the vector in the stack, avoiding
  * dynamically allocated memory. This function should be used when it is
- * not possible to allocate memory.
+ * not possible to allocate memory for a vector. The single instruction
+ * is patched in immediately.
  */
-void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
 {
-        struct text_poke_loc tp;
-
-        text_poke_loc_init(&tp, addr, opcode, len, emulate);
-        text_poke_bp_batch(&tp, 1);
+        __smp_text_poke_batch_add(addr, opcode, len, emulate);
+        smp_text_poke_batch_finish();
 }
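
As a usage illustration of the renamed batching API above: the sketch below shows how a caller in the style of the jump-label machinery might queue a set of NOP5 -> JMP32 conversions and then apply them with the three-step INT3 protocol. It is a minimal sketch, not part of this patch; example_enable_jumps(), 'sites' and 'targets' are made-up names, and it assumes the existing text_gen_insn() helper plus the rule that all text-patching callers hold text_mutex.

/* Illustrative sketch only -- not part of this patch. */
#include <linux/memory.h>               /* text_mutex */
#include <asm/text-patching.h>          /* smp_text_poke_batch_add(), text_gen_insn() */

static void example_enable_jumps(void **sites, void **targets, int nr)
{
        int i;

        mutex_lock(&text_mutex);

        for (i = 0; i < nr; i++) {
                /* Build "jmp targets[i]" encoded relative to sites[i]: */
                const void *jmp = text_gen_insn(JMP32_INSN_OPCODE, sites[i], targets[i]);

                /* Queued only; flushes early if the array fills up or ordering breaks: */
                smp_text_poke_batch_add(sites[i], jmp, JMP32_INSN_SIZE, NULL);
        }

        /* Patch all queued sites, with an SMP sync of every CPU between steps: */
        smp_text_poke_batch_finish();

        mutex_unlock(&text_mutex);
}

smp_text_poke_single() is simply the degenerate case of the same path: one __smp_text_poke_batch_add() followed by an immediate smp_text_poke_batch_finish().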
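
To make the displacement arithmetic in smp_text_poke_int3_handler() concrete, here is a small worked example with made-up addresses. example_emulated_target() is hypothetical; it only mirrors the handler's steps shown above (ip = regs->ip - INT3_INSN_SIZE; ip += tpl->len; then ip + tpl->disp).

/* Illustrative sketch only -- not part of this patch. */
#include <linux/types.h>
#include <asm/text-patching.h>          /* INT3_INSN_SIZE */

/*
 * Suppose a 5-byte "jmp 0x2000" is being written at address 0x1000, so
 * tpl->len == 5 and tpl->disp == 0x2000 - (0x1000 + 5) == 0xffb.  A CPU
 * that hits the temporary INT3 traps with regs->ip == 0x1001.  The handler
 * rewinds by INT3_INSN_SIZE to find the entry at 0x1000, steps over the
 * patched range and applies the displacement, resuming at
 * 0x1000 + 5 + 0xffb == 0x2000 -- the new jump target.
 */
static unsigned long example_emulated_target(unsigned long regs_ip, u8 len, s32 disp)
{
        unsigned long ip = regs_ip - INT3_INSN_SIZE;    /* == text_poke_addr(tpl) */

        ip += len;                      /* end of the instruction being replaced */

        return ip + disp;               /* what int3_emulate_jmp()/_call() resume at */
}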