author     Ben Hutchings <benh@debian.org>    2015-07-22 20:46:22 +0000
committer  Ben Hutchings <benh@debian.org>    2015-07-22 20:46:22 +0000
commit     561e869fdaf54df3b2bcc5f44e8133024c3a3dc7 (patch)
tree       e5602af106ebfc1bfb932e4cd0169bd9e8329fe7
parent     2017c10b63fae2ca56bb12ca8063b69ecba04e2c (diff)
[amd64] Fix nested NMI handling (CVE-2015-3290, CVE-2015-3291, CVE-2015-5157)
svn path=/dists/sid/linux/; revision=22842
-rw-r--r--  debian/changelog                                                                             10
-rw-r--r--  debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch    70
-rw-r--r--  debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch                41
-rw-r--r--  debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch    47
-rw-r--r--  debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch   191
-rw-r--r--  debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch               53
-rw-r--r--  debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch        135
-rw-r--r--  debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch                 285
-rw-r--r--  debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch                    91
-rw-r--r--  debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch    90
-rw-r--r--  debian/patches/series                                                                         9
11 files changed, 1022 insertions(+), 0 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index f2612c41ee80..84e378d5c077 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,7 +1,17 @@
 linux (4.0.8-2) UNRELEASED; urgency=medium
 
+  [ Uwe Kleine-König ]
   * [rt] Update to 4.0.8-rt6
 
+  [ Ben Hutchings ]
+  * [amd64] Fix nested NMI handling (CVE-2015-3290, CVE-2015-3291,
+    CVE-2015-5157)
+    - Enable nested do_nmi handling for 64-bit kernels
+    - Remove asm code that saves cr2
+    - Switch stacks on userspace NMI entry
+    - Reorder nested NMI checks
+    - Use DF to avoid userspace RSP confusing nested NMI detection
+
  -- Uwe Kleine-König <uwe@kleine-koenig.org>  Tue, 21 Jul 2015 23:19:12 +0200
 
 linux (4.0.8-1) unstable; urgency=medium
diff --git a/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch b/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch
new file mode 100644
index 000000000000..2744ca48c22f
--- /dev/null
+++ b/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch
@@ -0,0 +1,70 @@
+From: Denys Vlasenko <dvlasenk@redhat.com>
+Date: Wed, 1 Apr 2015 16:50:57 +0200
+Subject: [1/9] x86/asm/entry/64: Fold the 'test_in_nmi' macro into its
+ only user
+Origin: https://git.kernel.org/linus/0784b36448a2a85b95b6eb21a69b9045c896c065
+
+No code changes.
+
+Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
+Acked-by: Borislav Petkov <bp@suse.de>
+Cc: Alexei Starovoitov <ast@plumgrid.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Will Drewry <wad@chromium.org>
+Link: http://lkml.kernel.org/r/1427899858-7165-1-git-send-email-dvlasenk@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ arch/x86/kernel/entry_64.S | 28 +++++++++++++---------------
+ 1 file changed, 13 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1462,19 +1462,7 @@ ENTRY(error_exit)
+ CFI_ENDPROC
+ END(error_exit)
+
+-/*
+- * Test if a given stack is an NMI stack or not.
+- */
+- .macro test_in_nmi reg stack nmi_ret normal_ret
+- cmpq %\reg, \stack
+- ja \normal_ret
+- subq $EXCEPTION_STKSZ, %\reg
+- cmpq %\reg, \stack
+- jb \normal_ret
+- jmp \nmi_ret
+- .endm
+-
+- /* runs on exception stack */
++/* Runs on exception stack */
+ ENTRY(nmi)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+@@ -1535,8 +1523,18 @@ ENTRY(nmi)
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+- lea 6*8(%rsp), %rdx
+- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
++ lea 6*8(%rsp), %rdx
++ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
++ cmpq %rdx, 4*8(%rsp)
++ /* If the stack pointer is above the NMI stack, this is a normal NMI */
++ ja first_nmi
++ subq $EXCEPTION_STKSZ, %rdx
++ cmpq %rdx, 4*8(%rsp)
++ /* If it is below the NMI stack, it is a normal NMI */
++ jb first_nmi
++ /* Ah, it is within the NMI stack, treat it as nested */
++ jmp nested_nmi
++
+ CFI_REMEMBER_STATE
+
+ nested_nmi:
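
As an aside, the open-coded sequence that replaces the test_in_nmi macro above decides
whether the interrupted RSP lies within the NMI IST stack. A rough C rendering of the
same check, as a sketch only -- the function and parameter names here are hypothetical,
not kernel symbols; "stksz" stands for the EXCEPTION_STKSZ constant the asm subtracts:

    #include <stdbool.h>

    /* Mirrors the cmpq/ja, subq, cmpq/jb, jmp sequence above: "nested" means
     * prev_rsp lies in [nmi_stack_top - stksz, nmi_stack_top]. */
    static bool rsp_is_on_nmi_stack(unsigned long prev_rsp,
                                    unsigned long nmi_stack_top,
                                    unsigned long stksz)
    {
            if (prev_rsp > nmi_stack_top)           /* above the NMI stack -> first_nmi */
                    return false;
            if (prev_rsp < nmi_stack_top - stksz)   /* below the NMI stack -> first_nmi */
                    return false;
            return true;                            /* within it -> nested_nmi */
    }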
diff --git a/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch b/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch
new file mode 100644
index 000000000000..b04ce3a7b870
--- /dev/null
+++ b/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch
@@ -0,0 +1,41 @@
+From: Denys Vlasenko <dvlasenk@redhat.com>
+Date: Tue, 7 Apr 2015 22:43:41 +0200
+Subject: [2/9] x86/asm/entry/64: Remove a redundant jump
+Origin: https://git.kernel.org/linus/a30b0085f54efae11f6256df4e4a16af7eefc1c4
+
+Jumping to the very next instruction is not very useful:
+
+ jmp label
+ label:
+
+Removing the jump.
+
+Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Alexei Starovoitov <ast@plumgrid.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Drewry <wad@chromium.org>
+Link: http://lkml.kernel.org/r/1428439424-7258-5-git-send-email-dvlasenk@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ arch/x86/kernel/entry_64.S | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1533,7 +1533,6 @@ ENTRY(nmi)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+- jmp nested_nmi
+
+ CFI_REMEMBER_STATE
+
diff --git a/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch b/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch
new file mode 100644
index 000000000000..af71d5282229
--- /dev/null
+++ b/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch
@@ -0,0 +1,47 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 4 Jun 2015 13:24:29 -0700
+Subject: [3/9] x86/asm/entry/64: Remove pointless jump to irq_return
+Origin: https://git.kernel.org/linus/5ca6f70f387b4f82903037cc3c5488e2c97dcdbc
+
+INTERRUPT_RETURN turns into a jmp instruction. There's no need
+for extra indirection.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: <linux-kernel@vger.kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/2f2318653dbad284a59311f13f08cea71298fd7c.1433449436.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0: adjust filename, context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/entry_64.S | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -862,8 +862,6 @@ retint_restore_args: /* return to kernel
+ TRACE_IRQS_IRETQ
+ restore_args:
+ RESTORE_ARGS 1,8,1
+-
+-irq_return:
+ INTERRUPT_RETURN
+
+ ENTRY(native_iret)
+@@ -1708,7 +1706,7 @@ nmi_restore:
+
+ /* Clear the NMI executing stack variable */
+ movq $0, 5*8(%rsp)
+- jmp irq_return
++ INTERRUPT_RETURN
+ CFI_ENDPROC
+ END(nmi)
+
diff --git a/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch b/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch
new file mode 100644
index 000000000000..cb18ec247688
--- /dev/null
+++ b/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch
@@ -0,0 +1,191 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:33 -0700
+Subject: [4/9] x86/nmi: Enable nested do_nmi() handling for 64-bit kernels
+Origin: https://git.kernel.org/linus/9d05041679904b12c12421cbcf9cb5f4860a8d7b
+
+32-bit kernels handle nested NMIs in C. Enable the exact same
+handling on 64-bit kernels as well. This isn't currently
+necessary, but it will become necessary once the asm code starts
+allowing limited nesting.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ arch/x86/kernel/nmi.c | 123 +++++++++++++++++++++-----------------------------
+ 1 file changed, 52 insertions(+), 71 deletions(-)
+
+--- a/arch/x86/kernel/nmi.c
++++ b/arch/x86/kernel/nmi.c
+@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_reg
+ NOKPROBE_SYMBOL(default_do_nmi);
+
+ /*
+- * NMIs can hit breakpoints which will cause it to lose its
+- * NMI context with the CPU when the breakpoint does an iret.
+- */
+-#ifdef CONFIG_X86_32
+-/*
+- * For i386, NMIs use the same stack as the kernel, and we can
+- * add a workaround to the iret problem in C (preventing nested
+- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
+- * can be in:
++ * NMIs can hit breakpoints which will cause it to lose its NMI context
++ * with the CPU when the breakpoint or page fault does an IRET.
++ *
++ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
++ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
++ * if the outer NMI came from kernel mode, but we can still nest if the
++ * outer NMI came from user mode.
++ *
++ * To handle these nested NMIs, we have three states:
+ *
+ * 1) not running
+ * 2) executing
+@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+- * If an NMI hits a breakpoint that executes an iret, another
+- * NMI can preempt it. We do not want to allow this new NMI
+- * to run, but we want to execute it when the first one finishes.
+- * We set the state to "latched", and the exit of the first NMI will
+- * perform a dec_return, if the result is zero (NOT_RUNNING), then
+- * it will simply exit the NMI handler. If not, the dec_return
+- * would have set the state to NMI_EXECUTING (what we want it to
+- * be when we are running). In this case, we simply jump back
+- * to rerun the NMI handler again, and restart the 'latched' NMI.
++ * If an NMI executes an iret, another NMI can preempt it. We do not
++ * want to allow this new NMI to run, but we want to execute it when the
++ * first one finishes. We set the state to "latched", and the exit of
++ * the first NMI will perform a dec_return, if the result is zero
++ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
++ * dec_return would have set the state to NMI_EXECUTING (what we want it
++ * to be when we are running). In this case, we simply jump back to
++ * rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+@@ -461,49 +460,36 @@ enum nmi_states {
+ static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+
+-#define nmi_nesting_preprocess(regs) \
+- do { \
+- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
+- this_cpu_write(nmi_state, NMI_LATCHED); \
+- return; \
+- } \
+- this_cpu_write(nmi_state, NMI_EXECUTING); \
+- this_cpu_write(nmi_cr2, read_cr2()); \
+- } while (0); \
+- nmi_restart:
+-
+-#define nmi_nesting_postprocess() \
+- do { \
+- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
+- write_cr2(this_cpu_read(nmi_cr2)); \
+- if (this_cpu_dec_return(nmi_state)) \
+- goto nmi_restart; \
+- } while (0)
+-#else /* x86_64 */
++#ifdef CONFIG_X86_64
+ /*
+- * In x86_64 things are a bit more difficult. This has the same problem
+- * where an NMI hitting a breakpoint that calls iret will remove the
+- * NMI context, allowing a nested NMI to enter. What makes this more
+- * difficult is that both NMIs and breakpoints have their own stack.
+- * When a new NMI or breakpoint is executed, the stack is set to a fixed
+- * point. If an NMI is nested, it will have its stack set at that same
+- * fixed address that the first NMI had, and will start corrupting the
+- * stack. This is handled in entry_64.S, but the same problem exists with
+- * the breakpoint stack.
+- *
+- * If a breakpoint is being processed, and the debug stack is being used,
+- * if an NMI comes in and also hits a breakpoint, the stack pointer
+- * will be set to the same fixed address as the breakpoint that was
+- * interrupted, causing that stack to be corrupted. To handle this case,
+- * check if the stack that was interrupted is the debug stack, and if
+- * so, change the IDT so that new breakpoints will use the current stack
+- * and not switch to the fixed address. On return of the NMI, switch back
+- * to the original IDT.
++ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
++ * some care, the inner breakpoint will clobber the outer breakpoint's
++ * stack.
++ *
++ * If a breakpoint is being processed, and the debug stack is being
++ * used, if an NMI comes in and also hits a breakpoint, the stack
++ * pointer will be set to the same fixed address as the breakpoint that
++ * was interrupted, causing that stack to be corrupted. To handle this
++ * case, check if the stack that was interrupted is the debug stack, and
++ * if so, change the IDT so that new breakpoints will use the current
++ * stack and not switch to the fixed address. On return of the NMI,
++ * switch back to the original IDT.
+ */
+ static DEFINE_PER_CPU(int, update_debug_stack);
++#endif
+
+-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
++dotraplinkage notrace void
++do_nmi(struct pt_regs *regs, long error_code)
+ {
++ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
++ this_cpu_write(nmi_state, NMI_LATCHED);
++ return;
++ }
++ this_cpu_write(nmi_state, NMI_EXECUTING);
++ this_cpu_write(nmi_cr2, read_cr2());
++nmi_restart:
++
++#ifdef CONFIG_X86_64
+ /*
+ * If we interrupted a breakpoint, it is possible that
+ * the nmi handler will have breakpoints too. We need to
+@@ -514,22 +500,8 @@ static inline void nmi_nesting_preproces
+ debug_stack_set_zero();
+ this_cpu_write(update_debug_stack, 1);
+ }
+-}
+-
+-static inline void nmi_nesting_postprocess(void)
+-{
+- if (unlikely(this_cpu_read(update_debug_stack))) {
+- debug_stack_reset();
+- this_cpu_write(update_debug_stack, 0);
+- }
+-}
+ #endif
+
+-dotraplinkage notrace void
+-do_nmi(struct pt_regs *regs, long error_code)
+-{
+- nmi_nesting_preprocess(regs);
+-
+ nmi_enter();
+
+ inc_irq_stat(__nmi_count);
+@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_
+
+ nmi_exit();
+
+- /* On i386, may loop back to preprocess */
+- nmi_nesting_postprocess();
++#ifdef CONFIG_X86_64
++ if (unlikely(this_cpu_read(update_debug_stack))) {
++ debug_stack_reset();
++ this_cpu_write(update_debug_stack, 0);
++ }
++#endif
++
++ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
++ write_cr2(this_cpu_read(nmi_cr2));
++ if (this_cpu_dec_return(nmi_state))
++ goto nmi_restart;
+ }
+ NOKPROBE_SYMBOL(do_nmi);
+
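
As an aside, the rework above gives 64-bit do_nmi() the same three-state latch
(NMI_NOT_RUNNING / NMI_EXECUTING / NMI_LATCHED) that 32-bit already used. A minimal
stand-alone C sketch of that state machine, with the per-CPU storage, CR2 handling and
the real handler omitted (do_nmi_sketch and handle_nmi_once are placeholder names, not
kernel functions):

    enum nmi_states { NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED };

    static int nmi_state;

    static void handle_nmi_once(void)
    {
            /* stand-in for the real NMI processing */
    }

    static void do_nmi_sketch(void)
    {
            if (nmi_state != NMI_NOT_RUNNING) {
                    /* Nested invocation: latch it and let the outer one rerun. */
                    nmi_state = NMI_LATCHED;
                    return;
            }
            nmi_state = NMI_EXECUTING;
    restart:
            handle_nmi_once();
            /* Like this_cpu_dec_return(): EXECUTING -> NOT_RUNNING finishes,
             * LATCHED -> EXECUTING loops back and reruns the handler. */
            if (--nmi_state != NMI_NOT_RUNNING)
                    goto restart;
    }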
diff --git a/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch b/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch
new file mode 100644
index 000000000000..33fdec1efb96
--- /dev/null
+++ b/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch
@@ -0,0 +1,53 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:34 -0700
+Subject: [5/9] x86/nmi/64: Remove asm code that saves CR2
+Origin: https://git.kernel.org/linus/0e181bb58143cb4a2e8f01c281b0816cd0e4798e
+
+Now that do_nmi saves CR2, we don't need to save it in asm.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Acked-by: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0: adjust filename, context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/entry_64.S | 18 ------------------
+ 1 file changed, 18 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1673,29 +1673,11 @@ end_repeat_nmi:
+ call save_paranoid
+ DEFAULT_FRAME 0
+
+- /*
+- * Save off the CR2 register. If we take a page fault in the NMI then
+- * it could corrupt the CR2 value. If the NMI preempts a page fault
+- * handler before it was able to read the CR2 register, and then the
+- * NMI itself takes a page fault, the page fault that was preempted
+- * will read the information from the NMI page fault and not the
+- * origin fault. Save it off and restore it if it changes.
+- * Use the r12 callee-saved register.
+- */
+- movq %cr2, %r12
+-
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+
+- /* Did the NMI take a page fault? Restore cr2 if it did */
+- movq %cr2, %rcx
+- cmpq %rcx, %r12
+- je 1f
+- movq %r12, %cr2
+-1:
+-
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+ nmi_swapgs:
diff --git a/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch b/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch
new file mode 100644
index 000000000000..168bf3d3cccf
--- /dev/null
+++ b/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch
@@ -0,0 +1,135 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:35 -0700
+Subject: [6/9] x86/nmi/64: Switch stacks on userspace NMI entry
+Origin: https://git.kernel.org/linus/9b6e6a8334d56354853f9c255d1395c2ba570e0a
+
+Returning to userspace is tricky: IRET can fail, and ESPFIX can
+rearrange the stack prior to IRET.
+
+The NMI nesting fixup relies on a precise stack layout and
+atomic IRET. Rather than trying to teach the NMI nesting fixup
+to handle ESPFIX and failed IRET, punt: run NMIs that came from
+user mode on the normal kernel stack.
+
+This will make some nested NMIs visible to C code, but the C
+code is okay with that.
+
+As a side effect, this should speed up perf: it eliminates an
+RDMSR when NMIs come from user mode.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0:
+ - Adjust filename, context
+ - s/restore_c_regs_and_iret/restore_args/
+ - Use kernel_stack + KERNEL_STACK_OFFSET instead of cpu_current_top_of_stack]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+[luto: Open-coded return path to avoid dependency on partial pt_regs details]
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+---
+ arch/x86/kernel/entry_64.S | 79 +++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 75 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1494,19 +1494,90 @@ ENTRY(nmi)
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
++ *
++ * However, espfix prevents us from directly returning to userspace
++ * with a single IRET instruction. Similarly, IRET to user mode
++ * can fault. We therefore handle NMIs from user space like
++ * other IST entries.
+ */
+
+ /* Use %rdx as out temp variable throughout */
+ pushq_cfi %rdx
+ CFI_REL_OFFSET rdx, 0
+
++ testb $3, CS-RIP+8(%rsp)
++ jz .Lnmi_from_kernel
++
++ /*
++ * NMI from user mode. We need to run on the thread stack, but we
++ * can't go through the normal entry paths: NMIs are masked, and
++ * we don't want to enable interrupts, because then we'll end
++ * up in an awkward situation in which IRQs are on but NMIs
++ * are off.
++ */
++
++ SWAPGS
++ cld
++ movq %rsp, %rdx
++ movq PER_CPU_VAR(kernel_stack), %rsp
++ addq $KERNEL_STACK_OFFSET, %rsp
++ pushq 5*8(%rdx) /* pt_regs->ss */
++ pushq 4*8(%rdx) /* pt_regs->rsp */
++ pushq 3*8(%rdx) /* pt_regs->flags */
++ pushq 2*8(%rdx) /* pt_regs->cs */
++ pushq 1*8(%rdx) /* pt_regs->rip */
++ pushq $-1 /* pt_regs->orig_ax */
++ pushq %rdi /* pt_regs->di */
++ pushq %rsi /* pt_regs->si */
++ pushq (%rdx) /* pt_regs->dx */
++ pushq %rcx /* pt_regs->cx */
++ pushq %rax /* pt_regs->ax */
++ pushq %r8 /* pt_regs->r8 */
++ pushq %r9 /* pt_regs->r9 */
++ pushq %r10 /* pt_regs->r10 */
++ pushq %r11 /* pt_regs->r11 */
++ pushq %rbx /* pt_regs->rbx */
++ pushq %rbp /* pt_regs->rbp */
++ pushq %r12 /* pt_regs->r12 */
++ pushq %r13 /* pt_regs->r13 */
++ pushq %r14 /* pt_regs->r14 */
++ pushq %r15 /* pt_regs->r15 */
++
++ /*
++ * At this point we no longer need to worry about stack damage
++ * due to nesting -- we're on the normal thread stack and we're
++ * done with the NMI stack.
++ */
++
++ movq %rsp, %rdi
++ movq $-1, %rsi
++ call do_nmi
++
++ /*
++ * Return back to user mode. We must *not* do the normal exit
++ * work, because we don't want to enable interrupts. Fortunately,
++ * do_nmi doesn't modify pt_regs.
++ */
++ SWAPGS
++
+ /*
+- * If %cs was not the kernel segment, then the NMI triggered in user
+- * space, which means it is definitely not nested.
++ * Open-code the entire return process for compatibility with varying
++ * register layouts across different kernel versions.
+ */
+- cmpl $__KERNEL_CS, 16(%rsp)
+- jne first_nmi
++ addq $6*8, %rsp /* skip bx, bp, and r12-r15 */
++ popq %r11 /* pt_regs->r11 */
++ popq %r10 /* pt_regs->r10 */
++ popq %r9 /* pt_regs->r9 */
++ popq %r8 /* pt_regs->r8 */
++ popq %rax /* pt_regs->ax */
++ popq %rcx /* pt_regs->cx */
++ popq %rdx /* pt_regs->dx */
++ popq %rsi /* pt_regs->si */
++ popq %rdi /* pt_regs->di */
++ addq $8, %rsp /* skip orig_ax */
++ INTERRUPT_RETURN
+
++.Lnmi_from_kernel:
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
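
As an aside, the "testb $3, CS-RIP+8(%rsp)" added above checks the low two bits (the
privilege level) of the saved CS selector: non-zero means the NMI interrupted user mode,
so the handler copies the hardware frame onto the normal thread stack and calls do_nmi
from there. The same test in plain C (sketch only; the helper name is hypothetical):

    /* CPL is the low 2 bits of a code segment selector: 0 = kernel, 3 = user. */
    static int nmi_came_from_user_mode(unsigned long saved_cs)
    {
            return (saved_cs & 3) != 0;
    }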
diff --git a/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch b/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch
new file mode 100644
index 000000000000..a5e18861e87f
--- /dev/null
+++ b/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch
@@ -0,0 +1,285 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:36 -0700
+Subject: [7/9] x86/nmi/64: Improve nested NMI comments
+Origin: https://git.kernel.org/linus/0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b
+
+I found the nested NMI documentation to be difficult to follow.
+Improve the comments.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0: adjust filename, context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/entry_64.S | 159 ++++++++++++++++++++++++++-------------------
+ arch/x86/kernel/nmi.c | 4 +-
+ 2 files changed, 93 insertions(+), 70 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1481,11 +1481,12 @@ ENTRY(nmi)
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+- * o Copy the interrupt frame into a "saved" location on the stack
+- * o Copy the interrupt frame into a "copy" location on the stack
++ * o Copy the interrupt frame into an "outermost" location on the
++ * stack
++ * o Copy the interrupt frame into an "iret" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+- * o Modify the "copy" location to jump to the repeate_nmi
++ * o Modify the "iret" location to jump to the repeat_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+@@ -1579,18 +1580,60 @@ ENTRY(nmi)
+
+ .Lnmi_from_kernel:
+ /*
+- * Check the special variable on the stack to see if NMIs are
+- * executing.
++ * Here's what our stack frame will look like:
++ * +---------------------------------------------------------+
++ * | original SS |
++ * | original Return RSP |
++ * | original RFLAGS |
++ * | original CS |
++ * | original RIP |
++ * +---------------------------------------------------------+
++ * | temp storage for rdx |
++ * +---------------------------------------------------------+
++ * | "NMI executing" variable |
++ * +---------------------------------------------------------+
++ * | iret SS } Copied from "outermost" frame |
++ * | iret Return RSP } on each loop iteration; overwritten |
++ * | iret RFLAGS } by a nested NMI to force another |
++ * | iret CS } iteration if needed. |
++ * | iret RIP } |
++ * +---------------------------------------------------------+
++ * | outermost SS } initialized in first_nmi; |
++ * | outermost Return RSP } will not be changed before |
++ * | outermost RFLAGS } NMI processing is done. |
++ * | outermost CS } Copied to "iret" frame on each |
++ * | outermost RIP } iteration. |
++ * +---------------------------------------------------------+
++ * | pt_regs |
++ * +---------------------------------------------------------+
++ *
++ * The "original" frame is used by hardware. Before re-enabling
++ * NMIs, we need to be done with it, and we need to leave enough
++ * space for the asm code here.
++ *
++ * We return by executing IRET while RSP points to the "iret" frame.
++ * That will either return for real or it will loop back into NMI
++ * processing.
++ *
++ * The "outermost" frame is copied to the "iret" frame on each
++ * iteration of the loop, so each iteration starts with the "iret"
++ * frame pointing to the final return target.
++ */
++
++ /*
++ * Determine whether we're a nested NMI.
++ *
++ * First check "NMI executing". If it's set, then we're nested.
++ * This will not detect if we interrupted an outer NMI just
++ * before IRET.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+- * Now test if the previous stack was an NMI stack.
+- * We need the double check. We check the NMI stack to satisfy the
+- * race when the first NMI clears the variable before returning.
+- * We check the variable because the first NMI could be in a
+- * breakpoint routine using a breakpoint stack.
++ * Now test if the previous stack was an NMI stack. This covers
++ * the case where we interrupt an outer NMI after it clears
++ * "NMI executing" but before IRET.
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+@@ -1607,9 +1650,11 @@ ENTRY(nmi)
+
+ nested_nmi:
+ /*
+- * Do nothing if we interrupted the fixup in repeat_nmi.
+- * It's about to repeat the NMI handler, so we are fine
+- * with ignoring this one.
++ * If we interrupted an NMI that is between repeat_nmi and
++ * end_repeat_nmi, then we must not modify the "iret" frame
++ * because it's being written by the outer NMI. That's okay;
++ * the outer NMI handler is about to call do_nmi anyway,
++ * so we can just resume the outer NMI.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+@@ -1619,7 +1664,10 @@ nested_nmi:
+ ja nested_nmi_out
+
+ 1:
+- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
++ /*
++ * Modify the "iret" frame to point to repeat_nmi, forcing another
++ * iteration of NMI handling.
++ */
+ leaq -1*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 1*8
+@@ -1638,60 +1686,23 @@ nested_nmi_out:
+ popq_cfi %rdx
+ CFI_RESTORE rdx
+
+- /* No need to check faults here */
++ /* We are returning to kernel mode, so this cannot result in a fault. */
+ INTERRUPT_RETURN
+
+ CFI_RESTORE_STATE
+ first_nmi:
+- /*
+- * Because nested NMIs will use the pushed location that we
+- * stored in rdx, we must keep that space available.
+- * Here's what our stack frame will look like:
+- * +-------------------------+
+- * | original SS |
+- * | original Return RSP |
+- * | original RFLAGS |
+- * | original CS |
+- * | original RIP |
+- * +-------------------------+
+- * | temp storage for rdx |
+- * +-------------------------+
+- * | NMI executing variable |
+- * +-------------------------+
+- * | copied SS |
+- * | copied Return RSP |
+- * | copied RFLAGS |
+- * | copied CS |
+- * | copied RIP |
+- * +-------------------------+
+- * | Saved SS |
+- * | Saved Return RSP |
+- * | Saved RFLAGS |
+- * | Saved CS |
+- * | Saved RIP |
+- * +-------------------------+
+- * | pt_regs |
+- * +-------------------------+
+- *
+- * The saved stack frame is used to fix up the copied stack frame
+- * that a nested NMI may change to make the interrupted NMI iret jump
+- * to the repeat_nmi. The original stack frame and the temp storage
+- * is also used by nested NMIs and can not be trusted on exit.
+- */
+- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
++ /* Restore rdx. */
+ movq (%rsp), %rdx
+ CFI_RESTORE rdx
+
+- /* Set the NMI executing variable on the stack. */
++ /* Set "NMI executing" on the stack. */
+ pushq_cfi $1
+
+- /*
+- * Leave room for the "copied" frame
+- */
++ /* Leave room for the "iret" frame */
+ subq $(5*8), %rsp
+ CFI_ADJUST_CFA_OFFSET 5*8
+
+- /* Copy the stack frame to the Saved frame */
++ /* Copy the "original" frame to the "outermost" frame */
+ .rept 5
+ pushq_cfi 11*8(%rsp)
+ .endr
+@@ -1699,6 +1710,7 @@ first_nmi:
+
+ /* Everything up to here is safe from nested NMIs */
+
++repeat_nmi:
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+@@ -1707,16 +1719,21 @@ first_nmi:
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+- */
+-repeat_nmi:
+- /*
+- * Update the stack variable to say we are still in NMI (the update
+- * is benign for the non-repeat case, where 1 was pushed just above
+- * to this very stack slot).
++ *
++ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
++ * we're repeating an NMI, gsbase has the same value that it had on
++ * the first iteration. paranoid_entry will load the kernel
++ * gsbase if needed before we call do_nmi.
++ *
++ * Set "NMI executing" in case we came back here via IRET.
+ */
+ movq $1, 10*8(%rsp)
+
+- /* Make another copy, this one may be modified by nested NMIs */
++ /*
++ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
++ * here must not modify the "iret" frame while we're writing to
++ * it or it will end up containing garbage.
++ */
+ addq $(10*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -10*8
+ .rept 5
+@@ -1727,9 +1744,9 @@ repeat_nmi:
+ end_repeat_nmi:
+
+ /*
+- * Everything below this point can be preempted by a nested
+- * NMI if the first NMI took an exception and reset our iret stack
+- * so that we repeat another NMI.
++ * Everything below this point can be preempted by a nested NMI.
++ * If this happens, then the inner NMI will change the "iret"
++ * frame to point back to repeat_nmi.
+ */
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+@@ -1754,11 +1771,17 @@ end_repeat_nmi:
+ nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+- /* Pop the extra iret frame at once */
++
+ RESTORE_ALL 6*8
+
+- /* Clear the NMI executing stack variable */
++ /* Clear "NMI executing". */
+ movq $0, 5*8(%rsp)
++
++ /*
++ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
++ * stack in a single instruction. We are returning to kernel
++ * mode, so this cannot result in a fault.
++ */
+ INTERRUPT_RETURN
+ CFI_ENDPROC
+ END(nmi)
+--- a/arch/x86/kernel/nmi.c
++++ b/arch/x86/kernel/nmi.c
+@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_reg
+ NOKPROBE_SYMBOL(default_do_nmi);
+
+ /*
+- * NMIs can hit breakpoints which will cause it to lose its NMI context
+- * with the CPU when the breakpoint or page fault does an IRET.
++ * NMIs can page fault or hit breakpoints which will cause it to lose
++ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
diff --git a/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch b/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch
new file mode 100644
index 000000000000..68357e1c1dec
--- /dev/null
+++ b/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch
@@ -0,0 +1,91 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:37 -0700
+Subject: [8/9] x86/nmi/64: Reorder nested NMI checks
+Origin: https://git.kernel.org/linus/a27507ca2d796cfa8d907de31ad730359c8a6d06
+
+Check the repeat_nmi .. end_repeat_nmi special case first. The
+next patch will rework the RSP check and, as a side effect, the
+RSP check will no longer detect repeat_nmi .. end_repeat_nmi, so
+we'll need this ordering of the checks.
+
+Note: this is more subtle than it appears. The check for
+repeat_nmi .. end_repeat_nmi jumps straight out of the NMI code
+instead of adjusting the "iret" frame to force a repeat. This
+is necessary, because the code between repeat_nmi and
+end_repeat_nmi sets "NMI executing" and then writes to the
+"iret" frame itself. If a nested NMI comes in and modifies the
+"iret" frame while repeat_nmi is also modifying it, we'll end up
+with garbage. The old code got this right, as does the new
+code, but the new code is a bit more explicit.
+
+If we were to move the check right after the "NMI executing"
+check, then we'd get it wrong and have random crashes.
+
+( Because the "NMI executing" check would jump to the code that would
+ modify the "iret" frame without checking if the interrupted NMI was
+ currently modifying it. )
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0: adjust filename, spacing]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/entry_64.S | 34 ++++++++++++++++++----------------
+ 1 file changed, 18 insertions(+), 16 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1623,7 +1623,24 @@ ENTRY(nmi)
+ /*
+ * Determine whether we're a nested NMI.
+ *
+- * First check "NMI executing". If it's set, then we're nested.
++ * If we interrupted kernel code between repeat_nmi and
++ * end_repeat_nmi, then we are a nested NMI. We must not
++ * modify the "iret" frame because it's being written by
++ * the outer NMI. That's okay; the outer NMI handler is
++ * about to call do_nmi anyway, so we can just
++ * resume the outer NMI.
++ */
++
++ movq $repeat_nmi, %rdx
++ cmpq 8(%rsp), %rdx
++ ja 1f
++ movq $end_repeat_nmi, %rdx
++ cmpq 8(%rsp), %rdx
++ ja nested_nmi_out
++1:
++
++ /*
++ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
+ */
+@@ -1650,21 +1667,6 @@ ENTRY(nmi)
+
+ nested_nmi:
+ /*
+- * If we interrupted an NMI that is between repeat_nmi and
+- * end_repeat_nmi, then we must not modify the "iret" frame
+- * because it's being written by the outer NMI. That's okay;
+- * the outer NMI handler is about to call do_nmi anyway,
+- * so we can just resume the outer NMI.
+- */
+- movq $repeat_nmi, %rdx
+- cmpq 8(%rsp), %rdx
+- ja 1f
+- movq $end_repeat_nmi, %rdx
+- cmpq 8(%rsp), %rdx
+- ja nested_nmi_out
+-
+-1:
+- /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
diff --git a/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch b/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch
new file mode 100644
index 000000000000..1d4bf0a75a7f
--- /dev/null
+++ b/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch
@@ -0,0 +1,90 @@
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 15 Jul 2015 10:29:38 -0700
+Subject: x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI
+ detection
+Origin: https://git.kernel.org/linus/810bc075f78ff2c221536eb3008eac6a492dba2d
+
+We have a tricky bug in the nested NMI code: if we see RSP
+pointing to the NMI stack on NMI entry from kernel mode, we
+assume that we are executing a nested NMI.
+
+This isn't quite true. A malicious userspace program can point
+RSP at the NMI stack, issue SYSCALL, and arrange for an NMI to
+happen while RSP is still pointing at the NMI stack.
+
+Fix it with a sneaky trick. Set DF in the region of code that
+the RSP check is intended to detect. IRET will clear DF
+atomically.
+
+( Note: other than paravirt, there's little need for all this
+ complexity. We could check RIP instead of RSP. )
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bwh: Backported to 4.0: adjust filename, context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/entry_64.S | 29 +++++++++++++++++++++++++----
+ 1 file changed, 25 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1650,7 +1650,14 @@ ENTRY(nmi)
+ /*
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+- * "NMI executing" but before IRET.
++ * "NMI executing" but before IRET. We need to be careful, though:
++ * there is one case in which RSP could point to the NMI stack
++ * despite there being no NMI active: naughty userspace controls
++ * RSP at the very beginning of the SYSCALL targets. We can
++ * pull a fast one on naughty userspace, though: we program
++ * SYSCALL to mask DF, so userspace cannot cause DF to be set
++ * if it controls the kernel's RSP. We set DF before we clear
++ * "NMI executing".
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+@@ -1661,10 +1668,16 @@ ENTRY(nmi)
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+- /* Ah, it is within the NMI stack, treat it as nested */
++
++ /* Ah, it is within the NMI stack. */
++
++ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
++ jz first_nmi /* RSP was user controlled. */
+
+ CFI_REMEMBER_STATE
+
++ /* This is a nested NMI. */
++
+ nested_nmi:
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+@@ -1776,8 +1789,16 @@ nmi_restore:
+
+ RESTORE_ALL 6*8
+
+- /* Clear "NMI executing". */
+- movq $0, 5*8(%rsp)
++ /*
++ * Clear "NMI executing". Set DF first so that we can easily
++ * distinguish the remaining code between here and IRET from
++ * the SYSCALL entry and exit paths. On a native kernel, we
++ * could just inspect RIP, but, on paravirt kernels,
++ * INTERRUPT_RETURN can translate into a jump into a
++ * hypercall page.
++ */
++ std
++ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
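
As an aside, the DF trick in this last patch works because the only kernel code that
runs with DF set is the short window between the "std" and the final IRET of an outer
NMI, and SYSCALL is programmed to mask DF, so user code that parks RSP on the NMI stack
cannot fake the condition. The RFLAGS test amounts to the following (sketch only; the
helper name is hypothetical, X86_EFLAGS_DF is the architectural 0x400 bit the asm tests
byte-wise):

    #define X86_EFLAGS_DF 0x0400UL

    /* Nested-NMI hint: DF set in the interrupted context's saved RFLAGS. */
    static int saved_context_has_df(unsigned long saved_rflags)
    {
            return (saved_rflags & X86_EFLAGS_DF) != 0;
    }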
diff --git a/debian/patches/series b/debian/patches/series
index ab67a7b2506f..6620467247a4 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -82,3 +82,12 @@ debian/udp-fix-abi-change-in-4.0.6.patch
bugfix/mips/mips-normalise-code-flow-in-the-cpu-exception-handle.patch
bugfix/mips/mips-correct-fp-isa-requirements.patch
bugfix/x86/kvm-x86-fix-kvm_apic_has_events-to-check-for-null-po.patch
+bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch
+bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch
+bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch
+bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch
+bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch
+bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch
+bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch
+bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch
+bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch