author     Salvatore Bonaccorso <carnil@debian.org>   2021-08-03 05:50:12 +0000
committer  Salvatore Bonaccorso <carnil@debian.org>   2021-08-03 05:50:12 +0000
commit     b2bfc542bce52eb5b6d8849795279d5275e25176
tree       ad9594a22e60add6de05f70b073f64f00c52db6b
parent     b26bd6c2c5474bf73db3628c1037be8b202c3345
parent     2af7220d861470329d104dc1e133c2b384a6dc90
Merge branch 'bpf-5.10.y-bugfixes' into 'sid'
cherry-pick some bpf related fixes for 5.10.y
See merge request kernel-team/linux!380
11 files changed, 1302 insertions, 0 deletions
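The patches in this merge change the default of the kernel.unprivileged_bpf_disabled sysctl to 2 and document three values: 0 (unprivileged bpf() enabled, the upstream default), 1 (disabled without recovery), and 2 (disabled, but an admin may still write 0 or 1). A minimal userspace sketch of reading and interpreting the knob, assuming only the semantics documented in the patches below; the program is illustrative and not part of this commit:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/unprivileged_bpf_disabled", "r");
	int val;

	if (!f) {
		perror("unprivileged_bpf_disabled");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1) {
		fclose(f);
		fprintf(stderr, "unexpected sysctl format\n");
		return 1;
	}
	fclose(f);

	switch (val) {
	case 0:
		puts("0: unprivileged calls to bpf() are enabled (upstream default)");
		break;
	case 1:
		puts("1: disabled; cannot be cleared without a reboot");
		break;
	case 2:
		puts("2: disabled by default; an admin may still write 0 or 1");
		break;
	default:
		printf("%d: unexpected value\n", val);
	}
	return 0;
}

An admin who prefers the upstream behaviour can set kernel.unprivileged_bpf_disabled = 0 via sysctl, as the NEWS entry below spells out.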
diff --git a/debian/changelog b/debian/changelog index 002c3a72fbb5..2ac1d78018ae 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,20 @@ +linux (5.10.46-4) UNRELEASED; urgency=medium + + * bpf: Introduce BPF nospec instruction for mitigating Spectre v4 + (CVE-2021-34556, CVE-2021-35477) + * bpf: Fix leakage due to insufficient speculative store bypass mitigation + (CVE-2021-34556, CVE-2021-35477) + * bpf: Remove superfluous aux sanitation on subprog rejection + * Ignore ABI changes for bpf_offload_dev_create and bpf_verifier_log_write + * bpf: Add kconfig knob for disabling unpriv bpf by default + * init: Enable BPF_UNPRIV_DEFAULT_OFF (Closes: #990411) + * linux-image: Add NEWS entry documenting that unprivileged calls to bpf() are + disabled by default in Debian. + * bpf: verifier: Allocate idmap scratch in verifier env + * bpf: Fix pointer arithmetic mask tightening under state pruning + + -- Salvatore Bonaccorso <carnil@debian.org> Mon, 02 Aug 2021 12:36:15 +0200 + linux (5.10.46-3) unstable; urgency=medium * [armhf] Add mdio-aspeed to nic-modules. diff --git a/debian/config/config b/debian/config/config index 7ac1a23f12d9..31957d369cf1 100644 --- a/debian/config/config +++ b/debian/config/config @@ -6425,6 +6425,10 @@ CONFIG_KALLSYMS=y CONFIG_BPF_LSM=y CONFIG_BPF_SYSCALL=y # CONFIG_BPF_JIT_ALWAYS_ON is not set +#. Debian backport of 08389d888287 ("bpf: Add kconfig knob for disabling unpriv +#. bpf by default") in 5.13-rc4 adds the configuration option to init/Kconfig +#. and needs to be moved once rebasing to 5.13-rc4 and later. +CONFIG_BPF_UNPRIV_DEFAULT_OFF=y CONFIG_USERFAULTFD=y CONFIG_RSEQ=y # CONFIG_DEBUG_RSEQ is not set diff --git a/debian/config/defines b/debian/config/defines index c68a936fff0f..2c49966dde3f 100644 --- a/debian/config/defines +++ b/debian/config/defines @@ -4,6 +4,8 @@ ignore-changes: __cpuhp_* __udp_gso_segment bpf_analyzer + bpf_offload_dev_create + bpf_verifier_log_write cxl_* dax_flush ieee80211_nullfunc_get diff --git a/debian/linux-image.NEWS b/debian/linux-image.NEWS index 899e30abcaa2..f8e1fc022907 100644 --- a/debian/linux-image.NEWS +++ b/debian/linux-image.NEWS @@ -1,3 +1,19 @@ +linux (5.10.46-4) unstable; urgency=medium + + * From Linux 5.10.46-4, unprivileged calls to bpf() are disabled by + default, mitigating several security issues. However, an admin can + still change this setting later on, if needed, by writing 0 or 1 to + the kernel.unprivileged_bpf_disabled sysctl. + + If you prefer to keep unprivileged calls to bpf() enabled, set the + sysctl: + + kernel.unprivileged_bpf_disabled = 0 + + which is the upstream default. + + -- Salvatore Bonaccorso <carnil@debian.org> Mon, 02 Aug 2021 22:59:24 +0200 + linux (5.10~rc7-1~exp2) unstable; urgency=medium * From Linux 5.10, all users are allowed to create user namespaces by diff --git a/debian/patches/bugfix/all/bpf-Add-kconfig-knob-for-disabling-unpriv-bpf-by-def.patch b/debian/patches/bugfix/all/bpf-Add-kconfig-knob-for-disabling-unpriv-bpf-by-def.patch new file mode 100644 index 000000000000..6f51701866a2 --- /dev/null +++ b/debian/patches/bugfix/all/bpf-Add-kconfig-knob-for-disabling-unpriv-bpf-by-def.patch @@ -0,0 +1,134 @@ +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Tue, 11 May 2021 22:35:17 +0200 +Subject: bpf: Add kconfig knob for disabling unpriv bpf by default +Origin: https://git.kernel.org/linus/08389d888287c3823f80b0216766b71e17f0aba5 + +Add a kconfig knob which allows for unprivileged bpf to be disabled by default. 
+If set, the knob sets /proc/sys/kernel/unprivileged_bpf_disabled to value of 2. + +This still allows a transition of 2 -> {0,1} through an admin. Similarly, +this also still keeps 1 -> {1} behavior intact, so that once set to permanently +disabled, it cannot be undone aside from a reboot. + +We've also added extra2 with max of 2 for the procfs handler, so that an admin +still has a chance to toggle between 0 <-> 2. + +Either way, as an additional alternative, applications can make use of CAP_BPF +that we added a while ago. + +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Link: https://lore.kernel.org/bpf/74ec548079189e4e4dffaeb42b8987bb3c852eee.1620765074.git.daniel@iogearbox.net +[Salvatore Bonaccorso: Backport to 5.10.y: Filename change from +kernel/bpf/Kconfig back to init/Kconfig] +--- + Documentation/admin-guide/sysctl/kernel.rst | 17 +++++++++--- + kernel/bpf/Kconfig | 10 +++++++ + kernel/bpf/syscall.c | 3 ++- + kernel/sysctl.c | 29 +++++++++++++++++---- + 4 files changed, 50 insertions(+), 9 deletions(-) + +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1457,11 +1457,22 @@ unprivileged_bpf_disabled + ========================= + + Writing 1 to this entry will disable unprivileged calls to ``bpf()``; +-once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return +-``-EPERM``. ++once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` or ``CAP_BPF`` ++will return ``-EPERM``. Once set to 1, this can't be cleared from the ++running kernel anymore. + +-Once set, this can't be cleared. ++Writing 2 to this entry will also disable unprivileged calls to ``bpf()``, ++however, an admin can still change this setting later on, if needed, by ++writing 0 or 1 to this entry. + ++If ``BPF_UNPRIV_DEFAULT_OFF`` is enabled in the kernel config, then this ++entry will default to 2 instead of 0. ++ ++= ============================================================= ++0 Unprivileged calls to ``bpf()`` are enabled ++1 Unprivileged calls to ``bpf()`` are disabled without recovery ++2 Unprivileged calls to ``bpf()`` are disabled ++= ============================================================= + + watchdog + ======== +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1722,6 +1722,16 @@ config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + ++config BPF_UNPRIV_DEFAULT_OFF ++ bool "Disable unprivileged BPF by default" ++ depends on BPF_SYSCALL ++ help ++ Disables unprivileged BPF by default by setting the corresponding ++ /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can ++ still reenable it by setting it to 0 later on, or permanently ++ disable it by setting it to 1 (from which no other transition to ++ 0 is possible anymore). ++ + source "kernel/bpf/preload/Kconfig" + + config USERFAULTFD +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock); + static DEFINE_IDR(link_idr); + static DEFINE_SPINLOCK(link_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = ++ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 
2 : 0; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -237,7 +237,27 @@ static int bpf_stats_handler(struct ctl_ + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; + } +-#endif ++ ++static int bpf_unpriv_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret, unpriv_enable = *(int *)table->data; ++ bool locked_state = unpriv_enable == 1; ++ struct ctl_table tmp = *table; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ tmp.data = &unpriv_enable; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret) { ++ if (locked_state && unpriv_enable != 1) ++ return -EPERM; ++ *(int *)table->data = unpriv_enable; ++ } ++ return ret; ++} ++#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ + + /* + * /proc/sys support +@@ -2639,10 +2659,9 @@ static struct ctl_table kern_table[] = { + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, +- /* only handle a transition from default "0" to "1" */ +- .proc_handler = proc_dointvec_minmax, +- .extra1 = SYSCTL_ONE, +- .extra2 = SYSCTL_ONE, ++ .proc_handler = bpf_unpriv_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, + }, + { + .procname = "bpf_stats_enabled", diff --git a/debian/patches/bugfix/all/bpf-fix-leakage-due-to-insufficient-speculative-stor.patch b/debian/patches/bugfix/all/bpf-fix-leakage-due-to-insufficient-speculative-stor.patch new file mode 100644 index 000000000000..33b10802fe68 --- /dev/null +++ b/debian/patches/bugfix/all/bpf-fix-leakage-due-to-insufficient-speculative-stor.patch @@ -0,0 +1,452 @@ +From 7e0f6483e208dc514244e383e74ff3b15bd638df Mon Sep 17 00:00:00 2001 +From: Sasha Levin <sashal@kernel.org> +Date: Tue, 13 Jul 2021 08:18:31 +0000 +Subject: bpf: Fix leakage due to insufficient speculative store bypass + mitigation + +From: Daniel Borkmann <daniel@iogearbox.net> + +[ Upstream commit 2039f26f3aca5b0e419b98f65dd36481337b86ee ] + +Spectre v4 gadgets make use of memory disambiguation, which is a set of +techniques that execute memory access instructions, that is, loads and +stores, out of program order; Intel's optimization manual, section 2.4.4.5: + + A load instruction micro-op may depend on a preceding store. Many + microarchitectures block loads until all preceding store addresses are + known. The memory disambiguator predicts which loads will not depend on + any previous stores. When the disambiguator predicts that a load does + not have such a dependency, the load takes its data from the L1 data + cache. Eventually, the prediction is verified. If an actual conflict is + detected, the load and all succeeding instructions are re-executed. + +af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate +this attack by sanitizing the memory locations through preemptive "fast" +(low latency) stores of zero prior to the actual "slow" (high latency) store +of a pointer value such that upon dependency misprediction the CPU then +speculatively executes the load of the pointer value and retrieves the zero +value instead of the attacker controlled scalar value previously stored at +that location, meaning, subsequent access in the speculative domain is then +redirected to the "zero page". 
+ +The sanitized preemptive store of zero prior to the actual "slow" store is +done through a simple ST instruction based on r10 (frame pointer) with +relative offset to the stack location that the verifier has been tracking +on the original used register for STX, which does not have to be r10. Thus, +there are no memory dependencies for this store, since it's only using r10 +and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency +operation. + +However, a recent attack demonstrated that this mitigation is not sufficient +since the preemptive store of zero could also be turned into a "slow" store +and is thus bypassed as well: + + [...] + // r2 = oob address (e.g. scalar) + // r7 = pointer to map value + 31: (7b) *(u64 *)(r10 -16) = r2 + // r9 will remain "fast" register, r10 will become "slow" register below + 32: (bf) r9 = r10 + // JIT maps BPF reg to x86 reg: + // r9 -> r15 (callee saved) + // r10 -> rbp + // train store forward prediction to break dependency link between both r9 + // and r10 by evicting them from the predictor's LRU table. + 33: (61) r0 = *(u32 *)(r7 +24576) + 34: (63) *(u32 *)(r7 +29696) = r0 + 35: (61) r0 = *(u32 *)(r7 +24580) + 36: (63) *(u32 *)(r7 +29700) = r0 + 37: (61) r0 = *(u32 *)(r7 +24584) + 38: (63) *(u32 *)(r7 +29704) = r0 + 39: (61) r0 = *(u32 *)(r7 +24588) + 40: (63) *(u32 *)(r7 +29708) = r0 + [...] + 543: (61) r0 = *(u32 *)(r7 +25596) + 544: (63) *(u32 *)(r7 +30716) = r0 + // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp + // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain + // in hardware registers. rbp becomes slow due to push/pop latency. below is + // disasm of bpf_ringbuf_output() helper for better visual context: + // + // ffffffff8117ee20: 41 54 push r12 + // ffffffff8117ee22: 55 push rbp + // ffffffff8117ee23: 53 push rbx + // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc + // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken + // [...] + // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea + // ffffffff8117eee7: 5b pop rbx + // ffffffff8117eee8: 5d pop rbp + // ffffffff8117eee9: 4c 89 e0 mov rax,r12 + // ffffffff8117eeec: 41 5c pop r12 + // ffffffff8117eeee: c3 ret + 545: (18) r1 = map[id:4] + 547: (bf) r2 = r7 + 548: (b7) r3 = 0 + 549: (b7) r4 = 4 + 550: (85) call bpf_ringbuf_output#194288 + // instruction 551 inserted by verifier \ + 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here + // storing map value pointer r7 at fp-16 | since value of r10 is "slow". + 552: (7b) *(u64 *)(r10 -16) = r7 / + // following "fast" read to the same memory location, but due to dependency + // misprediction it will speculatively execute before insn 551/552 completes. + 553: (79) r2 = *(u64 *)(r9 -16) + // in speculative domain contains attacker controlled r2. in non-speculative + // domain this contains r7, and thus accesses r7 +0 below. + 554: (71) r3 = *(u8 *)(r2 +0) + // leak r3 + +As can be seen, the current speculative store bypass mitigation which the +verifier inserts at line 551 is insufficient since /both/, the write of +the zero sanitation as well as the map value pointer are a high latency +instruction due to prior memory access via push/pop of r10 (rbp) in contrast +to the low latency read in line 553 as r9 (r15) which stays in hardware +registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, +fp-16 can still be r2. 
+ +Initial thoughts to address this issue was to track spilled pointer loads +from stack and enforce their load via LDX through r10 as well so that /both/ +the preemptive store of zero /as well as/ the load use the /same/ register +such that a dependency is created between the store and load. However, this +option is not sufficient either since it can be bypassed as well under +speculation. An updated attack with pointer spill/fills now _all_ based on +r10 would look as follows: + + [...] + // r2 = oob address (e.g. scalar) + // r7 = pointer to map value + [...] + // longer store forward prediction training sequence than before. + 2062: (61) r0 = *(u32 *)(r7 +25588) + 2063: (63) *(u32 *)(r7 +30708) = r0 + 2064: (61) r0 = *(u32 *)(r7 +25592) + 2065: (63) *(u32 *)(r7 +30712) = r0 + 2066: (61) r0 = *(u32 *)(r7 +25596) + 2067: (63) *(u32 *)(r7 +30716) = r0 + // store the speculative load address (scalar) this time after the store + // forward prediction training. + 2068: (7b) *(u64 *)(r10 -16) = r2 + // preoccupy the CPU store port by running sequence of dummy stores. + 2069: (63) *(u32 *)(r7 +29696) = r0 + 2070: (63) *(u32 *)(r7 +29700) = r0 + 2071: (63) *(u32 *)(r7 +29704) = r0 + 2072: (63) *(u32 *)(r7 +29708) = r0 + 2073: (63) *(u32 *)(r7 +29712) = r0 + 2074: (63) *(u32 *)(r7 +29716) = r0 + 2075: (63) *(u32 *)(r7 +29720) = r0 + 2076: (63) *(u32 *)(r7 +29724) = r0 + 2077: (63) *(u32 *)(r7 +29728) = r0 + 2078: (63) *(u32 *)(r7 +29732) = r0 + 2079: (63) *(u32 *)(r7 +29736) = r0 + 2080: (63) *(u32 *)(r7 +29740) = r0 + 2081: (63) *(u32 *)(r7 +29744) = r0 + 2082: (63) *(u32 *)(r7 +29748) = r0 + 2083: (63) *(u32 *)(r7 +29752) = r0 + 2084: (63) *(u32 *)(r7 +29756) = r0 + 2085: (63) *(u32 *)(r7 +29760) = r0 + 2086: (63) *(u32 *)(r7 +29764) = r0 + 2087: (63) *(u32 *)(r7 +29768) = r0 + 2088: (63) *(u32 *)(r7 +29772) = r0 + 2089: (63) *(u32 *)(r7 +29776) = r0 + 2090: (63) *(u32 *)(r7 +29780) = r0 + 2091: (63) *(u32 *)(r7 +29784) = r0 + 2092: (63) *(u32 *)(r7 +29788) = r0 + 2093: (63) *(u32 *)(r7 +29792) = r0 + 2094: (63) *(u32 *)(r7 +29796) = r0 + 2095: (63) *(u32 *)(r7 +29800) = r0 + 2096: (63) *(u32 *)(r7 +29804) = r0 + 2097: (63) *(u32 *)(r7 +29808) = r0 + 2098: (63) *(u32 *)(r7 +29812) = r0 + // overwrite scalar with dummy pointer; same as before, also including the + // sanitation store with 0 from the current mitigation by the verifier. + 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here + 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. + // load from stack intended to bypass stores. + 2101: (79) r2 = *(u64 *)(r10 -16) + 2102: (71) r3 = *(u8 *)(r2 +0) + // leak r3 + [...] + +Looking at the CPU microarchitecture, the scheduler might issue loads (such +as seen in line 2101) before stores (line 2099,2100) because the load execution +units become available while the store execution unit is still busy with the +sequence of dummy stores (line 2069-2098). And so the load may use the prior +stored scalar from r2 at address r10 -16 for speculation. The updated attack +may work less reliable on CPU microarchitectures where loads and stores share +execution resources. + +This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: +Prevent memory disambiguation attack") is insufficient. 
Moreover, the detection +of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been +written to a given stack slot where a pointer value is now to be stored does +not have sufficient coverage as precondition for the mitigation either; for +several reasons outlined as follows: + + 1) Stack content from prior program runs could still be preserved and is + therefore not "random", best example is to split a speculative store + bypass attack between tail calls, program A would prepare and store the + oob address at a given stack slot and then tail call into program B which + does the "slow" store of a pointer to the stack with subsequent "fast" + read. From program B PoV such stack slot type is STACK_INVALID, and + therefore also must be subject to mitigation. + + 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) + condition, for example, the previous content of that memory location could + also be a pointer to map or map value. Without the fix, a speculative + store bypass is not mitigated in such precondition and can then lead to + a type confusion in the speculative domain leaking kernel memory near + these pointer types. + +While brainstorming on various alternative mitigation possibilities, we also +stumbled upon a retrospective from Chrome developers [0]: + + [...] For variant 4, we implemented a mitigation to zero the unused memory + of the heap prior to allocation, which cost about 1% when done concurrently + and 4% for scavenging. Variant 4 defeats everything we could think of. We + explored more mitigations for variant 4 but the threat proved to be more + pervasive and dangerous than we anticipated. For example, stack slots used + by the register allocator in the optimizing compiler could be subject to + type confusion, leading to pointer crafting. Mitigating type confusion for + stack slots alone would have required a complete redesign of the backend of + the optimizing compiler, perhaps man years of work, without a guarantee of + completeness. [...] + +From BPF side, the problem space is reduced, however, options are rather +limited. One idea that has been explored was to xor-obfuscate pointer spills +to the BPF stack: + + [...] + // preoccupy the CPU store port by running sequence of dummy stores. + [...] + 2106: (63) *(u32 *)(r7 +29796) = r0 + 2107: (63) *(u32 *)(r7 +29800) = r0 + 2108: (63) *(u32 *)(r7 +29804) = r0 + 2109: (63) *(u32 *)(r7 +29808) = r0 + 2110: (63) *(u32 *)(r7 +29812) = r0 + // overwrite scalar with dummy pointer; xored with random 'secret' value + // of 943576462 before store ... + 2111: (b4) w11 = 943576462 + 2112: (af) r11 ^= r7 + 2113: (7b) *(u64 *)(r10 -16) = r11 + 2114: (79) r11 = *(u64 *)(r10 -16) + 2115: (b4) w2 = 943576462 + 2116: (af) r2 ^= r11 + // ... and restored with the same 'secret' value with the help of AX reg. + 2117: (71) r3 = *(u8 *)(r2 +0) + [...] + +While the above would not prevent speculation, it would make data leakage +infeasible by directing it to random locations. In order to be effective +and prevent type confusion under speculation, such random secret would have +to be regenerated for each store. The additional complexity involved for a +tracking mechanism that prevents jumps such that restoring spilled pointers +would not get corrupted is not worth the gain for unprivileged. Hence, the +fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC +instruction which the x86 JIT translates into a lfence opcode. 
Inserting the +latter in between the store and load instruction is one of the mitigations +options [1]. The x86 instruction manual notes: + + [...] An LFENCE that follows an instruction that stores to memory might + complete before the data being stored have become globally visible. [...] + +The latter meaning that the preceding store instruction finished execution +and the store is at minimum guaranteed to be in the CPU's store queue, but +it's not guaranteed to be in that CPU's L1 cache at that point (globally +visible). The latter would only be guaranteed via sfence. So the load which +is guaranteed to execute after the lfence for that local CPU would have to +rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: + + [...] For every store operation that is added to the ROB, an entry is + allocated in the store buffer. This entry requires both the virtual and + physical address of the target. Only if there is no free entry in the store + buffer, the frontend stalls until there is an empty slot available in the + store buffer again. Otherwise, the CPU can immediately continue adding + subsequent instructions to the ROB and execute them out of order. On Intel + CPUs, the store buffer has up to 56 entries. [...] + +One small upside on the fix is that it lifts constraints from af86ca4e3088 +where the sanitize_stack_off relative to r10 must be the same when coming +from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX +or BPF_ST instruction. This happens either when we store a pointer or data +value to the BPF stack for the first time, or upon later pointer spills. +The former needs to be enforced since otherwise stale stack data could be +leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | +BPF_NOSPEC mapping is currently optimized away, but others could emit a +speculation barrier as well if necessary. For real-world unprivileged +programs e.g. generated by LLVM, pointer spill/fill is only generated upon +register pressure and LLVM only tries to do that for pointers which are not +used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC +sanitation for the STACK_INVALID case when the first write to a stack slot +occurs e.g. upon map lookup. In future we might refine ways to mitigate +the latter cost. 
+ + [0] https://arxiv.org/pdf/1902.05178.pdf + [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ + [2] https://arxiv.org/pdf/1905.05725.pdf + +Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") +Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") +Co-developed-by: Piotr Krysiuk <piotras@gmail.com> +Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Signed-off-by: Piotr Krysiuk <piotras@gmail.com> +Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Sasha Levin <sashal@kernel.org> +--- + include/linux/bpf_verifier.h | 2 +- + kernel/bpf/verifier.c | 87 +++++++++++++----------------------- + 2 files changed, 33 insertions(+), 56 deletions(-) + +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index 2739a6431b9e..3d6fb346dc3b 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -319,8 +319,8 @@ struct bpf_insn_aux_data { + }; + u64 map_key_state; /* constant (32 bit) key tracking for maps */ + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ +- int sanitize_stack_off; /* stack slot to be cleared */ + u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ ++ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ + bool zext_dst; /* this insn zero extends dst reg */ + u8 alu_state; /* used in combination with alu_limit */ + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 36bc34fce623..e038d672200e 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2297,6 +2297,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + cur = env->cur_state->frame[env->cur_state->curframe]; + if (value_regno >= 0) + reg = &cur->regs[value_regno]; ++ if (!env->bypass_spec_v4) { ++ bool sanitize = reg && is_spillable_regtype(reg->type); ++ ++ for (i = 0; i < size; i++) { ++ if (state->stack[spi].slot_type[i] == STACK_INVALID) { ++ sanitize = true; ++ break; ++ } ++ } ++ ++ if (sanitize) ++ env->insn_aux_data[insn_idx].sanitize_stack_spill = true; ++ } + + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && + !register_is_null(reg) && env->bpf_capable) { +@@ -2319,47 +2332,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + verbose(env, "invalid size of register spill\n"); + return -EACCES; + } +- + if (state != cur && reg->type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } +- +- if (!env->bypass_spec_v4) { +- bool sanitize = false; +- +- if (state->stack[spi].slot_type[0] == STACK_SPILL && +- register_is_const(&state->stack[spi].spilled_ptr)) +- sanitize = true; +- for (i = 0; i < BPF_REG_SIZE; i++) +- if (state->stack[spi].slot_type[i] == STACK_MISC) { +- sanitize = true; +- break; +- } +- if (sanitize) { +- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; +- int soff = (-spi - 1) * BPF_REG_SIZE; +- +- /* detected reuse of integer stack slot with a pointer +- * which means either llvm is reusing stack slot or +- * an attacker is trying to exploit CVE-2018-3639 +- * (speculative store bypass) +- * Have to sanitize that slot with preemptive +- * store of zero. 
+- */ +- if (*poff && *poff != soff) { +- /* disallow programs where single insn stores +- * into two different stack slots, since verifier +- * cannot sanitize them +- */ +- verbose(env, +- "insn %d cannot access two stack slots fp%d and fp%d", +- insn_idx, *poff, soff); +- return -EINVAL; +- } +- *poff = soff; +- } +- } + save_register_state(state, spi, reg); + } else { + u8 type = STACK_MISC; +@@ -10947,35 +10923,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) + + for (i = 0; i < insn_cnt; i++, insn++) { + bpf_convert_ctx_access_t convert_ctx_access; ++ bool ctx_access; + + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || +- insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) ++ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { + type = BPF_READ; +- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || +- insn->code == (BPF_STX | BPF_MEM | BPF_H) || +- insn->code == (BPF_STX | BPF_MEM | BPF_W) || +- insn->code == (BPF_STX | BPF_MEM | BPF_DW)) ++ ctx_access = true; ++ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_H) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_W) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_DW) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_B) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_H) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_W) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { + type = BPF_WRITE; +- else ++ ctx_access = BPF_CLASS(insn->code) == BPF_STX; ++ } else { + continue; ++ } + + if (type == BPF_WRITE && +- env->insn_aux_data[i + delta].sanitize_stack_off) { ++ env->insn_aux_data[i + delta].sanitize_stack_spill) { + struct bpf_insn patch[] = { +- /* Sanitize suspicious stack slot with zero. +- * There are no memory dependencies for this store, +- * since it's only using frame pointer and immediate +- * constant of zero +- */ +- BPF_ST_MEM(BPF_DW, BPF_REG_FP, +- env->insn_aux_data[i + delta].sanitize_stack_off, +- 0), +- /* the original STX instruction will immediately +- * overwrite the same stack slot with appropriate value +- */ + *insn, ++ BPF_ST_NOSPEC(), + }; + + cnt = ARRAY_SIZE(patch); +@@ -10989,6 +10963,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) + continue; + } + ++ if (!ctx_access) ++ continue; ++ + switch (env->insn_aux_data[i + delta].ptr_type) { + case PTR_TO_CTX: + if (!ops->convert_ctx_access) +-- +2.30.2 + diff --git a/debian/patches/bugfix/all/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch b/debian/patches/bugfix/all/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch new file mode 100644 index 000000000000..717ca0d70c3b --- /dev/null +++ b/debian/patches/bugfix/all/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch @@ -0,0 +1,121 @@ +From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Fri, 16 Jul 2021 09:18:21 +0000 +Subject: bpf: Fix pointer arithmetic mask tightening under state pruning + +From: Daniel Borkmann <daniel@iogearbox.net> + +commit e042aa532c84d18ff13291d00620502ce7a38dda upstream. 
+ +In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we +narrowed the offset mask for unprivileged pointer arithmetic in order to +mitigate a corner case where in the speculative domain it is possible to +advance, for example, the map value pointer by up to value_size-1 out-of- +bounds in order to leak kernel memory via side-channel to user space. + +The verifier's state pruning for scalars leaves one corner case open +where in the first verification path R_x holds an unknown scalar with an +aux->alu_limit of e.g. 7, and in a second verification path that same +register R_x, here denoted as R_x', holds an unknown scalar which has +tighter bounds and would thus satisfy range_within(R_x, R_x') as well as +tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: +Given the second path fits the register constraints for pruning, the final +generated mask from aux->alu_limit will remain at 7. While technically +not wrong for the non-speculative domain, it would however be possible +to craft similar cases where the mask would be too wide as in 7fedb63a8307. + +One way to fix it is to detect the presence of unknown scalar map pointer +arithmetic and force a deeper search on unknown scalars to ensure that +we do not run into a masking mismatch. + +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/bpf_verifier.h | 1 + + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- + 2 files changed, 18 insertions(+), 10 deletions(-) + +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -397,6 +397,7 @@ struct bpf_verifier_env { + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ ++ bool explore_alu_limits; + bool allow_ptr_leaks; + bool allow_uninit_stack; + bool allow_ptr_to_map_access; +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -5792,6 +5792,12 @@ static int sanitize_ptr_alu(struct bpf_v + alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; ++ ++ /* Limit pruning on unknown scalars to enable deep search for ++ * potential masking differences from other program paths. 
++ */ ++ if (!off_is_imm) ++ env->explore_alu_limits = true; + } + + err = update_alu_sanitation_state(aux, alu_state, alu_limit); +@@ -9088,8 +9094,8 @@ next: + } + + /* Returns true if (rold safe implies rcur safe) */ +-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, +- struct bpf_id_pair *idmap) ++static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, ++ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) + { + bool equal; + +@@ -9115,6 +9121,8 @@ static bool regsafe(struct bpf_reg_state + return false; + switch (rold->type) { + case SCALAR_VALUE: ++ if (env->explore_alu_limits) ++ return false; + if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; +@@ -9204,9 +9212,8 @@ static bool regsafe(struct bpf_reg_state + return false; + } + +-static bool stacksafe(struct bpf_func_state *old, +- struct bpf_func_state *cur, +- struct bpf_id_pair *idmap) ++static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, ++ struct bpf_func_state *cur, struct bpf_id_pair *idmap) + { + int i, spi; + +@@ -9251,9 +9258,8 @@ static bool stacksafe(struct bpf_func_st + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; +- if (!regsafe(&old->stack[spi].spilled_ptr, +- &cur->stack[spi].spilled_ptr, +- idmap)) ++ if (!regsafe(env, &old->stack[spi].spilled_ptr, ++ &cur->stack[spi].spilled_ptr, idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. +@@ -9310,10 +9316,11 @@ static bool func_states_equal(struct bpf + + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) +- if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) ++ if (!regsafe(env, &old->regs[i], &cur->regs[i], ++ env->idmap_scratch)) + return false; + +- if (!stacksafe(old, cur, env->idmap_scratch)) ++ if (!stacksafe(env, old, cur, env->idmap_scratch)) + return false; + + if (!refsafe(old, cur)) diff --git a/debian/patches/bugfix/all/bpf-introduce-bpf-nospec-instruction-for-mitigating-.patch b/debian/patches/bugfix/all/bpf-introduce-bpf-nospec-instruction-for-mitigating-.patch new file mode 100644 index 000000000000..fd34a13e0454 --- /dev/null +++ b/debian/patches/bugfix/all/bpf-introduce-bpf-nospec-instruction-for-mitigating-.patch @@ -0,0 +1,322 @@ +From 4be98754f14316b6ab86ff08b955b892ab146676 Mon Sep 17 00:00:00 2001 +From: Sasha Levin <sashal@kernel.org> +Date: Tue, 13 Jul 2021 08:18:31 +0000 +Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 + +From: Daniel Borkmann <daniel@iogearbox.net> + +[ Upstream commit f5e81d1117501546b7be050c5fbafa6efd2c722c ] + +In case of JITs, each of the JIT backends compiles the BPF nospec instruction +/either/ to a machine instruction which emits a speculation barrier /or/ to +/no/ machine instruction in case the underlying architecture is not affected +by Speculative Store Bypass or has different mitigations in place already. + +This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' +instruction for mitigation. In case of arm64, we rely on the firmware mitigation +as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, +it works for all of the kernel code with no need to provide any additional +instructions here (hence only comment in arm64 JIT). Other archs can follow +as needed. 
The BPF nospec instruction is specifically targeting Spectre v4 +since i) we don't use a serialization barrier for the Spectre v1 case, and +ii) mitigation instructions for v1 and v4 might be different on some archs. + +The BPF nospec is required for a future commit, where the BPF verifier does +annotate intermediate BPF programs with speculation barriers. + +Co-developed-by: Piotr Krysiuk <piotras@gmail.com> +Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Signed-off-by: Piotr Krysiuk <piotras@gmail.com> +Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Sasha Levin <sashal@kernel.org> +--- + arch/arm/net/bpf_jit_32.c | 3 +++ + arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ + arch/mips/net/ebpf_jit.c | 3 +++ + arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ + arch/riscv/net/bpf_jit_comp32.c | 4 ++++ + arch/riscv/net/bpf_jit_comp64.c | 4 ++++ + arch/s390/net/bpf_jit_comp.c | 5 +++++ + arch/sparc/net/bpf_jit_comp_64.c | 3 +++ + arch/x86/net/bpf_jit_comp.c | 7 +++++++ + arch/x86/net/bpf_jit_comp32.c | 6 ++++++ + include/linux/filter.h | 15 +++++++++++++++ + kernel/bpf/core.c | 19 ++++++++++++++++++- + kernel/bpf/disasm.c | 16 +++++++++------- + 13 files changed, 96 insertions(+), 8 deletions(-) + +diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c +index 0207b6ea6e8a..ce8b04326352 100644 +--- a/arch/arm/net/bpf_jit_32.c ++++ b/arch/arm/net/bpf_jit_32.c +@@ -1602,6 +1602,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) + rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); + emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); + break; ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: +diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c +index ef9f1d5e989d..345066b8e9fc 100644 +--- a/arch/arm64/net/bpf_jit_comp.c ++++ b/arch/arm64/net/bpf_jit_comp.c +@@ -829,6 +829,19 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, + return ret; + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ /* ++ * Nothing required here. ++ * ++ * In case of arm64, we rely on the firmware mitigation of ++ * Speculative Store Bypass as controlled via the ssbd kernel ++ * parameter. Whenever the mitigation is enabled, it works ++ * for all of the kernel code with no need to provide any ++ * additional instructions. 
++ */ ++ break; ++ + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: +diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c +index 561154cbcc40..b31b91e57c34 100644 +--- a/arch/mips/net/ebpf_jit.c ++++ b/arch/mips/net/ebpf_jit.c +@@ -1355,6 +1355,9 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, + } + break; + ++ case BPF_ST | BPF_NOSPEC: /* speculation barrier */ ++ break; ++ + case BPF_ST | BPF_B | BPF_MEM: + case BPF_ST | BPF_H | BPF_MEM: + case BPF_ST | BPF_W | BPF_MEM: +diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c +index 022103c6a201..658ca2bab13c 100644 +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -646,6 +646,12 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, + } + break; + ++ /* ++ * BPF_ST NOSPEC (speculation barrier) ++ */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ + /* + * BPF_ST(X) + */ +diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c +index 579575f9cdae..f300f93ba645 100644 +--- a/arch/riscv/net/bpf_jit_comp32.c ++++ b/arch/riscv/net/bpf_jit_comp32.c +@@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, + return -1; + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_W: +diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c +index 8a56b5293117..c113ae818b14 100644 +--- a/arch/riscv/net/bpf_jit_comp64.c ++++ b/arch/riscv/net/bpf_jit_comp64.c +@@ -939,6 +939,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, + emit_ld(rd, 0, RV_REG_T1, ctx); + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + emit_imm(RV_REG_T1, imm, ctx); +diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c +index fc44dce59536..dee01d3b23a4 100644 +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, + break; + } + break; ++ /* ++ * BPF_NOSPEC (speculation barrier) ++ */ ++ case BPF_ST | BPF_NOSPEC: ++ break; + /* + * BPF_ST(X) + */ +diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c +index 3364e2a00989..fef734473c0f 100644 +--- a/arch/sparc/net/bpf_jit_comp_64.c ++++ b/arch/sparc/net/bpf_jit_comp_64.c +@@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) + return 1; + break; + } ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index d5fa77256058..0a962cd6bac1 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1141,6 +1141,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + } + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ if (boot_cpu_has(X86_FEATURE_XMM2)) ++ /* Emit 'lfence' */ ++ EMIT3(0x0F, 0xAE, 0xE8); ++ break; ++ + /* ST: *(u8*)(dst_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + if (is_ereg(dst_reg)) +diff --git a/arch/x86/net/bpf_jit_comp32.c 
b/arch/x86/net/bpf_jit_comp32.c +index 2cf4d217840d..4bd0f98df700 100644 +--- a/arch/x86/net/bpf_jit_comp32.c ++++ b/arch/x86/net/bpf_jit_comp32.c +@@ -1705,6 +1705,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + i++; + break; + } ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ if (boot_cpu_has(X86_FEATURE_XMM2)) ++ /* Emit 'lfence' */ ++ EMIT3(0x0F, 0xAE, 0xE8); ++ break; + /* ST: *(u8*)(dst_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: +diff --git a/include/linux/filter.h b/include/linux/filter.h +index e2ffa02f9067..822b701c803d 100644 +--- a/include/linux/filter.h ++++ b/include/linux/filter.h +@@ -72,6 +72,11 @@ struct ctl_table_header; + /* unused opcode to mark call to interpreter with arguments */ + #define BPF_CALL_ARGS 0xe0 + ++/* unused opcode to mark speculation barrier for mitigating ++ * Speculative Store Bypass ++ */ ++#define BPF_NOSPEC 0xc0 ++ + /* As per nm, we expose JITed images as text (code) section for + * kallsyms. That way, tools like perf can find it to match + * addresses. +@@ -372,6 +377,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) + .off = 0, \ + .imm = 0 }) + ++/* Speculation barrier */ ++ ++#define BPF_ST_NOSPEC() \ ++ ((struct bpf_insn) { \ ++ .code = BPF_ST | BPF_NOSPEC, \ ++ .dst_reg = 0, \ ++ .src_reg = 0, \ ++ .off = 0, \ ++ .imm = 0 }) ++ + /* Internal classic blocks for direct assignment */ + + #define __BPF_STMT(CODE, K) \ +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 75c2d184018a..d12efb2550d3 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -32,6 +32,8 @@ + #include <linux/perf_event.h> + #include <linux/extable.h> + #include <linux/log2.h> ++ ++#include <asm/barrier.h> + #include <asm/unaligned.h> + + /* Registers */ +@@ -1380,6 +1382,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) + /* Non-UAPI available opcodes. */ + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, + [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, ++ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, + [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, + [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, + [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, +@@ -1624,7 +1627,21 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) + COND_JMP(s, JSGE, >=) + COND_JMP(s, JSLE, <=) + #undef COND_JMP +- /* STX and ST and LDX*/ ++ /* ST, STX and LDX*/ ++ ST_NOSPEC: ++ /* Speculation barrier for mitigating Speculative Store Bypass. ++ * In case of arm64, we rely on the firmware mitigation as ++ * controlled via the ssbd kernel parameter. Whenever the ++ * mitigation is enabled, it works for all of the kernel code ++ * with no need to provide any additional instructions here. ++ * In case of x86, we use 'lfence' insn for mitigation. We ++ * reuse preexisting logic from Spectre v1 mitigation that ++ * happens to produce the required code on x86 for v4 as well. 
++ */ ++#ifdef CONFIG_X86 ++ barrier_nospec(); ++#endif ++ CONT; + #define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ +diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c +index b44d8c447afd..ff1dd7d45b58 100644 +--- a/kernel/bpf/disasm.c ++++ b/kernel/bpf/disasm.c +@@ -162,15 +162,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, + else + verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { +- if (BPF_MODE(insn->code) != BPF_MEM) { ++ if (BPF_MODE(insn->code) == BPF_MEM) { ++ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", ++ insn->code, ++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++ insn->dst_reg, ++ insn->off, insn->imm); ++ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { ++ verbose(cbs->private_data, "(%02x) nospec\n", insn->code); ++ } else { + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); +- return; + } +- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", +- insn->code, +- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +- insn->dst_reg, +- insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); +-- +2.30.2 + diff --git a/debian/patches/bugfix/all/bpf-remove-superfluous-aux-sanitation-on-subprog-rejection.patch b/debian/patches/bugfix/all/bpf-remove-superfluous-aux-sanitation-on-subprog-rejection.patch new file mode 100644 index 000000000000..e3b5a84476f9 --- /dev/null +++ b/debian/patches/bugfix/all/bpf-remove-superfluous-aux-sanitation-on-subprog-rejection.patch @@ -0,0 +1,79 @@ +From 59089a189e3adde4cf85f2ce479738d1ae4c514d Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Tue, 29 Jun 2021 09:39:15 +0000 +Subject: bpf: Remove superfluous aux sanitation on subprog rejection + +From: Daniel Borkmann <daniel@iogearbox.net> + +commit 59089a189e3adde4cf85f2ce479738d1ae4c514d upstream. + +Follow-up to fe9a5ca7e370 ("bpf: Do not mark insn as seen under speculative +path verification"). The sanitize_insn_aux_data() helper does not serve a +particular purpose in today's code. The original intention for the helper +was that if function-by-function verification fails, a given program would +be cleared from temporary insn_aux_data[], and then its verification would +be re-attempted in the context of the main program a second time. + +However, a failure in do_check_subprogs() will skip do_check_main() and +propagate the error to the user instead, thus such situation can never occur. +Given its interaction is not compatible to the Spectre v1 mitigation (due to +comparing aux->seen with env->pass_cnt), just remove sanitize_insn_aux_data() +to avoid future bugs in this area. + +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + kernel/bpf/verifier.c | 34 ---------------------------------- + 1 file changed, 34 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -11707,37 +11707,6 @@ static void free_states(struct bpf_verif + } + } + +-/* The verifier is using insn_aux_data[] to store temporary data during +- * verification and to store information for passes that run after the +- * verification like dead code sanitization. do_check_common() for subprogram N +- * may analyze many other subprograms. 
sanitize_insn_aux_data() clears all +- * temporary data after do_check_common() finds that subprogram N cannot be +- * verified independently. pass_cnt counts the number of times +- * do_check_common() was run and insn->aux->seen tells the pass number +- * insn_aux_data was touched. These variables are compared to clear temporary +- * data from failed pass. For testing and experiments do_check_common() can be +- * run multiple times even when prior attempt to verify is unsuccessful. +- * +- * Note that special handling is needed on !env->bypass_spec_v1 if this is +- * ever called outside of error path with subsequent program rejection. +- */ +-static void sanitize_insn_aux_data(struct bpf_verifier_env *env) +-{ +- struct bpf_insn *insn = env->prog->insnsi; +- struct bpf_insn_aux_data *aux; +- int i, class; +- +- for (i = 0; i < env->prog->len; i++) { +- class = BPF_CLASS(insn[i].code); +- if (class != BPF_LDX && class != BPF_STX) +- continue; +- aux = &env->insn_aux_data[i]; +- if (aux->seen != env->pass_cnt) +- continue; +- memset(aux, 0, offsetof(typeof(*aux), orig_idx)); +- } +-} +- + static int do_check_common(struct bpf_verifier_env *env, int subprog) + { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); +@@ -11807,9 +11776,6 @@ out: + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); + free_states(env); +- if (ret) +- /* clean aux data in case subprog was rejected */ +- sanitize_insn_aux_data(env); + return ret; + } + diff --git a/debian/patches/bugfix/all/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch b/debian/patches/bugfix/all/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch new file mode 100644 index 000000000000..020ce21220e7 --- /dev/null +++ b/debian/patches/bugfix/all/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch @@ -0,0 +1,149 @@ +From c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 Mon Sep 17 00:00:00 2001 +From: Lorenz Bauer <lmb@cloudflare.com> +Date: Thu, 29 Apr 2021 14:46:56 +0100 +Subject: bpf: verifier: Allocate idmap scratch in verifier env + +From: Lorenz Bauer <lmb@cloudflare.com> + +commit c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 upstream. + +func_states_equal makes a very short lived allocation for idmap, +probably because it's too large to fit on the stack. However the +function is called quite often, leading to a lot of alloc / free +churn. Replace the temporary allocation with dedicated scratch +space in struct bpf_verifier_env. 
+ +Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Acked-by: Edward Cree <ecree.xilinx@gmail.com> +Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/bpf_verifier.h | 8 +++++++ + kernel/bpf/verifier.c | 46 ++++++++++++++----------------------------- + 2 files changed, 23 insertions(+), 31 deletions(-) + +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -204,6 +204,13 @@ struct bpf_idx_pair { + u32 idx; + }; + ++struct bpf_id_pair { ++ u32 old; ++ u32 cur; ++}; ++ ++/* Maximum number of register states that can exist at once */ ++#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) + #define MAX_CALL_FRAMES 8 + struct bpf_verifier_state { + /* call stack tracking */ +@@ -401,6 +408,7 @@ struct bpf_verifier_env { + const struct bpf_line_info *prev_linfo; + struct bpf_verifier_log log; + struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; ++ struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; + struct { + int *insn_state; + int *insn_stack; +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -8962,13 +8962,6 @@ static bool range_within(struct bpf_reg_ + old->s32_max_value >= cur->s32_max_value; + } + +-/* Maximum number of register states that can exist at once */ +-#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) +-struct idpair { +- u32 old; +- u32 cur; +-}; +- + /* If in the old state two registers had the same id, then they need to have + * the same id in the new state as well. But that id could be different from + * the old state, so we need to track the mapping from old to new ids. +@@ -8979,11 +8972,11 @@ struct idpair { + * So we look through our idmap to see if this old id has been seen before. If + * so, we require the new id to match; otherwise, we add the id pair to the map. 
+ */ +-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) ++static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) + { + unsigned int i; + +- for (i = 0; i < ID_MAP_SIZE; i++) { ++ for (i = 0; i < BPF_ID_MAP_SIZE; i++) { + if (!idmap[i].old) { + /* Reached an empty slot; haven't seen this id before */ + idmap[i].old = old_id; +@@ -9096,7 +9089,7 @@ next: + + /* Returns true if (rold safe implies rcur safe) */ + static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, +- struct idpair *idmap) ++ struct bpf_id_pair *idmap) + { + bool equal; + +@@ -9213,7 +9206,7 @@ static bool regsafe(struct bpf_reg_state + + static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, +- struct idpair *idmap) ++ struct bpf_id_pair *idmap) + { + int i, spi; + +@@ -9310,32 +9303,23 @@ static bool refsafe(struct bpf_func_stat + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +-static bool func_states_equal(struct bpf_func_state *old, ++static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur) + { +- struct idpair *idmap; +- bool ret = false; + int i; + +- idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); +- /* If we failed to allocate the idmap, just say it's not safe */ +- if (!idmap) +- return false; +- +- for (i = 0; i < MAX_BPF_REG; i++) { +- if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) +- goto out_free; +- } ++ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); ++ for (i = 0; i < MAX_BPF_REG; i++) ++ if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) ++ return false; + +- if (!stacksafe(old, cur, idmap)) +- goto out_free; ++ if (!stacksafe(old, cur, env->idmap_scratch)) ++ return false; + + if (!refsafe(old, cur)) +- goto out_free; +- ret = true; +-out_free: +- kfree(idmap); +- return ret; ++ return false; ++ ++ return true; + } + + static bool states_equal(struct bpf_verifier_env *env, +@@ -9362,7 +9346,7 @@ static bool states_equal(struct bpf_veri + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; +- if (!func_states_equal(old->frame[i], cur->frame[i])) ++ if (!func_states_equal(env, old->frame[i], cur->frame[i])) + return false; + } + return true; diff --git a/debian/patches/series b/debian/patches/series index 3ae926b62985..aa7245e357ab 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -129,6 +129,12 @@ bugfix/all/Input-joydev-prevent-use-of-not-validated-data-in-JS.patch bugfix/all/sctp-validate-from_addr_param-return.patch bugfix/all/sctp-add-size-validation-when-walking-chunks.patch bugfix/all/sctp-fix-return-value-check-in-__sctp_rcv_asconf_loo.patch +bugfix/all/bpf-introduce-bpf-nospec-instruction-for-mitigating-.patch +bugfix/all/bpf-fix-leakage-due-to-insufficient-speculative-stor.patch +bugfix/all/bpf-remove-superfluous-aux-sanitation-on-subprog-rejection.patch +bugfix/all/bpf-Add-kconfig-knob-for-disabling-unpriv-bpf-by-def.patch +bugfix/all/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch +bugfix/all/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch # Fix exported symbol versions bugfix/all/module-disable-matching-missing-version-crc.patch |
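For reference, the non-UAPI BPF_NOSPEC mode introduced above (0xc0, combined with class BPF_ST) yields instruction code 0xc2, which the new disasm hunk prints as "(c2) nospec" and which the x86 JIT lowers to lfence (0f ae e8) when SSE2 is present. A small userspace sketch mirroring the BPF_ST_NOSPEC() macro and the disasm format; the struct layout follows the UAPI bpf_insn definition, and the 0xc2 value is derived from the standard BPF class/mode encoding rather than stated explicitly in the patches:

#include <stdio.h>
#include <stdint.h>

/* Same layout as the UAPI struct bpf_insn. */
struct bpf_insn {
	uint8_t  code;		/* opcode: class | mode bits */
	uint8_t  dst_reg:4;
	uint8_t  src_reg:4;
	int16_t  off;
	int32_t  imm;
};

#define BPF_ST		0x02	/* instruction class */
#define BPF_NOSPEC	0xc0	/* non-UAPI mode from the patch above */

/* Mirrors the BPF_ST_NOSPEC() macro added to include/linux/filter.h:
 * all operands zero, only the code byte carries meaning. */
static struct bpf_insn bpf_st_nospec(void)
{
	return (struct bpf_insn){ .code = BPF_ST | BPF_NOSPEC };
}

int main(void)
{
	struct bpf_insn insn = bpf_st_nospec();

	/* Matches the format string added to kernel/bpf/disasm.c;
	 * prints "(c2) nospec". */
	printf("(%02x) nospec\n", insn.code);
	return 0;
}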