From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 16:57:22 +0200 Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier Right now, there is no notifier that is called on a new cpu, before the new cpu begins processing interrupts/softirqs. Various kernel function would need that notification, e.g. kvm works around by calling smp_call_function_single(), rcu polls cpu_online_map. The patch adds a CPU_STARTING notification. It also adds a helper function that sends the message to all cpu_chain handlers. Tested on x86-64. All other archs are untested. Especially on sparc, I'm not sure if I got it right. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- arch/s390/kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 00b9b4dec5e..9e8b1f9b8f4 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid) /* Enable pfault pseudo page faults on this cpu. */ pfault_init(); + /* call cpu notifiers */ + notify_cpu_starting(smp_processor_id()); /* Mark this cpu as online */ spin_lock(&call_lock); cpu_set(smp_processor_id(), cpu_online_map); -- cgit v1.2.3 From 3d6e48f43340343d97839eadb1ab7b6a3ea98797 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Tue, 9 Sep 2008 12:38:56 +0200 Subject: [S390] CVE-2008-1514: prevent ptrace padding area read/write in 31-bit mode When running a 31-bit ptrace, on either an s390 or s390x kernel, reads and writes into a padding area in struct user_regs_struct32 will result in a kernel panic. This is also known as CVE-2008-1514. Test case available here: http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/user-area-padding.c?cvsroot=systemtap Steps to reproduce: 1) wget the above 2) gcc -o user-area-padding-31bit user-area-padding.c -Wall -ggdb2 -D_GNU_SOURCE -m31 3) ./user-area-padding-31bit Test status ----------- Without patch, both s390 and s390x kernels panic. With patch, the test case, as well as the gdb testsuite, pass without incident, padding area reads returning zero, writes ignored. Nb: original version returned -EINVAL on write attempts, which broke the gdb test and made the test case slightly unhappy, Jan Kratochvil suggested the change to return 0 on write attempts. Signed-off-by: Jarod Wilson Tested-by: Jan Kratochvil Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/compat_ptrace.h | 1 + arch/s390/kernel/ptrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h index cde81fa64f8..a2be3a978d5 100644 --- a/arch/s390/kernel/compat_ptrace.h +++ b/arch/s390/kernel/compat_ptrace.h @@ -42,6 +42,7 @@ struct user_regs_struct32 u32 gprs[NUM_GPRS]; u32 acrs[NUM_ACRS]; u32 orig_gpr2; + /* nb: there's a 4-byte hole here */ s390_fp_regs fp_regs; /* * These per registers are in here so that gdb can modify them diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 2815bfe348a..c8b08289eb8 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -170,6 +170,13 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -270,6 +277,13 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) */ task_pt_regs(child)->orig_gpr2 = data; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent writes of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -428,6 +442,13 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) */ tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -514,6 +535,13 @@ static int __poke_user_compat(struct task_struct *child, */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent writess of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure -- cgit v1.2.3 From d3d238c7744d08c36a114a59cb537d4c0c6c9a86 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 3 Oct 2008 21:54:59 +0200 Subject: [S390] nohz: Fix __udelay. This fixes a regression that came with 934b2857cc576ae53c92a66e63fce7ddcfa74691 ("[S390] nohz/sclp: disable timer on synchronous waits."). If udelay() gets called from a disabled context it sets the clock comparator to a value where it expects the next interrupt. When the interrupt happens the clock comparator gets not reset and therefore the interrupt condition doesn't get cleared. The result is an endless timer interrupt loop. In addition this patch fixes also the following: rcutorture reveals that our __udelay implementation is still buggy, since it might schedule tasklets, but prevents their execution: NOHZ: local_softirq_pending 42 NOHZ: local_softirq_pending 02 NOHZ: local_softirq_pending 142 NOHZ: local_softirq_pending 02 To fix this we make sure that only the clock comparator interrupt is enabled when the enabled wait psw is loaded. Also no code gets called anymore which might schedule tasklets. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/time.c | 2 ++ arch/s390/lib/delay.c | 88 ++++++++++++++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 34 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index ca114fe46ff..06acb1a18bb 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -169,6 +169,8 @@ void init_cpu_timer(void) static void clock_comparator_interrupt(__u16 code) { + if (S390_lowcore.clock_comparator == -1ULL) + set_clock_comparator(S390_lowcore.clock_comparator); } static void etr_timing_alert(struct etr_irq_parm *); diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index fc6ab6094df..0953cee05ef 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -1,14 +1,9 @@ /* - * arch/s390/lib/delay.c * Precise Delay Loops for S390 * - * S390 version - * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * - * Derived from "arch/i386/lib/delay.c" - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares + * Copyright IBM Corp. 1999,2008 + * Author(s): Martin Schwidefsky , + * Heiko Carstens , */ #include @@ -29,30 +24,31 @@ void __delay(unsigned long loops) asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } -/* - * Waits for 'usecs' microseconds using the TOD clock comparator. - */ -void __udelay(unsigned long usecs) +static void __udelay_disabled(unsigned long usecs) { - u64 end, time, old_cc = 0; - unsigned long flags, cr0, mask, dummy; - int irq_context; + unsigned long mask, cr0, cr0_saved; + u64 clock_saved; - irq_context = in_interrupt(); - if (!irq_context) - local_bh_disable(); - local_irq_save(flags); - if (raw_irqs_disabled_flags(flags)) { - old_cc = local_tick_disable(); - S390_lowcore.clock_comparator = -1ULL; - __ctl_store(cr0, 0, 0); - dummy = (cr0 & 0xffff00e0) | 0x00000800; - __ctl_load(dummy , 0, 0); - mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; - } else - mask = psw_kernel_bits | PSW_MASK_WAIT | - PSW_MASK_EXT | PSW_MASK_IO; + clock_saved = local_tick_disable(); + set_clock_comparator(get_clock() + ((u64) usecs << 12)); + __ctl_store(cr0_saved, 0, 0); + cr0 = (cr0_saved & 0xffff00e0) | 0x00000800; + __ctl_load(cr0 , 0, 0); + mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; + trace_hardirqs_on(); + __load_psw_mask(mask); + local_irq_disable(); + __ctl_load(cr0_saved, 0, 0); + local_tick_enable(clock_saved); + set_clock_comparator(S390_lowcore.clock_comparator); +} +static void __udelay_enabled(unsigned long usecs) +{ + unsigned long mask; + u64 end, time; + + mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT | PSW_MASK_IO; end = get_clock() + ((u64) usecs << 12); do { time = end < S390_lowcore.clock_comparator ? @@ -62,13 +58,37 @@ void __udelay(unsigned long usecs) __load_psw_mask(mask); local_irq_disable(); } while (get_clock() < end); + set_clock_comparator(S390_lowcore.clock_comparator); +} - if (raw_irqs_disabled_flags(flags)) { - __ctl_load(cr0, 0, 0); - local_tick_enable(old_cc); +/* + * Waits for 'usecs' microseconds using the TOD clock comparator. + */ +void __udelay(unsigned long usecs) +{ + unsigned long flags; + + preempt_disable(); + local_irq_save(flags); + if (in_irq()) { + __udelay_disabled(usecs); + goto out; + } + if (in_softirq()) { + if (raw_irqs_disabled_flags(flags)) + __udelay_disabled(usecs); + else + __udelay_enabled(usecs); + goto out; } - if (!irq_context) + if (raw_irqs_disabled_flags(flags)) { + local_bh_disable(); + __udelay_disabled(usecs); _local_bh_enable(); - set_clock_comparator(S390_lowcore.clock_comparator); + goto out; + } + __udelay_enabled(usecs); +out: local_irq_restore(flags); + preempt_enable(); } -- cgit v1.2.3 From 7a0f475513fa573bc8e072021960313da32f0ee3 Mon Sep 17 00:00:00 2001 From: Klaus-Dieter Wacker Date: Fri, 10 Oct 2008 21:33:18 +0200 Subject: [S390] qdio enhanced SIGA (iqdio) support. Add support for z10 HiperSockets multiwrite SBALs on output queues. This is used on LPAR with EDDP enabled devices. Signed-off-by: Klaus-Dieter Wacker Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/qdio.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 6813772171f..4734c3f0535 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -299,7 +299,13 @@ struct qdio_ssqd_desc { u8 mbccnt; u16 qdioac2; u64 sch_token; - u64:64; + u8 mro; + u8 mri; + u8:8; + u8 sbalic; + u16:16; + u8:8; + u8 mmwc; } __attribute__ ((packed)); /* params are: ccw_device, qdio_error, queue_number, -- cgit v1.2.3 From d86730bb9597b02bff59a3a5a01c0094d71a265f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Oct 2008 21:33:19 +0200 Subject: [S390] s390: use sys_pause for 31bit pause entry point sys32_pause is a useless copy of the generic sys_pause. (and it's certainly not there for old sparc32 binaries..) Signed-off-by: Christoph Hellwig Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/compat_linux.c | 8 -------- arch/s390/kernel/compat_linux.h | 1 - arch/s390/kernel/compat_wrapper.S | 2 -- arch/s390/kernel/syscalls.S | 2 +- 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index d7f22226fc4..98e246dc023 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -608,14 +608,6 @@ asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, struct time return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } -/* These are here just in case some old sparc32 binary calls it. */ -asmlinkage long sys32_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - asmlinkage long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count, u32 poshi, u32 poslo) { diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 20723a06201..05f8516366a 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -206,7 +206,6 @@ long sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz); long sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz); -long sys32_pause(void); long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count, u32 poshi, u32 poslo); long sys32_pwrite64(unsigned int fd, const char __user *ubuf, diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 328a20e880b..ee51ca9e23b 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -128,8 +128,6 @@ sys32_alarm_wrapper: llgfr %r2,%r2 # unsigned int jg sys_alarm # branch to system call -#sys32_pause_wrapper # void - .globl compat_sys_utime_wrapper compat_sys_utime_wrapper: llgtr %r2,%r2 # char * diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index c66d35e5514..3ae303914b4 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -37,7 +37,7 @@ SYSCALL(sys_stime,sys_ni_syscall,sys32_stime_wrapper) /* 25 old stime syscall * SYSCALL(sys_ptrace,sys_ptrace,sys32_ptrace_wrapper) SYSCALL(sys_alarm,sys_alarm,sys32_alarm_wrapper) NI_SYSCALL /* old fstat syscall */ -SYSCALL(sys_pause,sys_pause,sys32_pause) +SYSCALL(sys_pause,sys_pause,sys_pause) SYSCALL(sys_utime,sys_utime,compat_sys_utime_wrapper) /* 30 */ NI_SYSCALL /* old stty syscall */ NI_SYSCALL /* old gtty syscall */ -- cgit v1.2.3 From 753c4dd6a2fa2af81f5d809d610d29f2d9dd9bc1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 10 Oct 2008 21:33:20 +0200 Subject: [S390] ptrace changes * System call parameter and result access functions * Add tracehook calls * Split syscall_trace into two functions do_syscall_trace_enter and do_syscall_trace_exit Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ptrace.h | 1 + arch/s390/include/asm/syscall.h | 80 +++++++++++++++++++++++++++++++++++++ arch/s390/include/asm/thread_info.h | 2 + arch/s390/kernel/entry.S | 50 ++++++++++++++++++----- arch/s390/kernel/entry64.S | 42 ++++++++++++++----- arch/s390/kernel/ptrace.c | 61 +++++++++++++++------------- arch/s390/kernel/signal.c | 13 ++++++ 8 files changed, 202 insertions(+), 48 deletions(-) create mode 100644 arch/s390/include/asm/syscall.h (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8d41908e251..4c03049e7db 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -74,6 +74,7 @@ config S390 select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if 64BIT + select HAVE_ARCH_TRACEHOOK source "init/Kconfig" diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index af2c9ac28a0..a7226f8143f 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -490,6 +490,7 @@ extern void user_disable_single_step(struct task_struct *); #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) +#define user_stack_pointer(regs)((regs)->gprs[15]) #define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) extern void show_regs(struct pt_regs * regs); diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h new file mode 100644 index 00000000000..6e623971fbb --- /dev/null +++ b/arch/s390/include/asm/syscall.h @@ -0,0 +1,80 @@ +/* + * Access to user system call parameters and results + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + */ + +#ifndef _ASM_SYSCALL_H +#define _ASM_SYSCALL_H 1 + +#include + +static inline long syscall_get_nr(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs->trap != __LC_SVC_OLD_PSW) + return -1; + return regs->gprs[2]; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->gprs[2] = regs->orig_gpr2; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + return (regs->gprs[2] >= -4096UL) ? -regs->gprs[2] : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->gprs[2]; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->gprs[2] = error ? -error : val; +} + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ + BUG_ON(i + n > 6); +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) { + if (i + n == 6) + args[--n] = (u32) regs->args[0]; + while (n-- > 0) + args[n] = (u32) regs->gprs[2 + i + n]; + } +#endif + if (i + n == 6) + args[--n] = regs->args[0]; + memcpy(args, ®s->gprs[2 + i], n * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + if (i + n == 6) + regs->args[0] = args[--n]; + memcpy(®s->gprs[2 + i], args, n * sizeof(args[0])); +} + +#endif /* _ASM_SYSCALL_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 91a8f93ad35..ea40a9d690f 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -86,6 +86,7 @@ static inline struct thread_info *current_thread_info(void) * thread information flags bit numbers */ #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTART_SVC 4 /* restart svc with new svc number */ @@ -100,6 +101,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */ #define _TIF_SYSCALL_TRACE (1< #include #include +#include #include #include @@ -639,40 +640,44 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } #endif -asmlinkage void -syscall_trace(struct pt_regs *regs, int entryexit) +asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { - if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); - - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - goto out; - if (!(current->ptrace & PT_PTRACED)) - goto out; - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); + long ret; /* - * If the debuffer has set an invalid system call number, - * we prepare to skip the system call restart handling. + * The sysc_tracesys code in entry.S stored the system + * call number to gprs[2]. */ - if (!entryexit && regs->gprs[2] >= NR_syscalls) + ret = regs->gprs[2]; + if (test_thread_flag(TIF_SYSCALL_TRACE) && + (tracehook_report_syscall_entry(regs) || + regs->gprs[2] >= NR_syscalls)) { + /* + * Tracing decided this syscall should not happen or the + * debugger stored an invalid system call number. Skip + * the system call and the system call restart handling. + */ regs->trap = -1; - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; + ret = -1; } - out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, regs->gprs[3], - regs->gprs[4], regs->gprs[5]); + + if (unlikely(current->audit_context)) + audit_syscall_entry(test_thread_flag(TIF_31BIT) ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); + return ret; +} + +asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), + regs->gprs[2]); + + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); } /* diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index b9768204021..4f7fc3059a8 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -507,6 +508,12 @@ void do_signal(struct pt_regs *regs) */ if (current->thread.per_info.single_step) set_thread_flag(TIF_SINGLE_STEP); + + /* + * Let tracing know that we've done the handler setup. + */ + tracehook_signal_handler(signr, &info, &ka, regs, + test_thread_flag(TIF_SINGLE_STEP)); } return; } @@ -526,3 +533,9 @@ void do_signal(struct pt_regs *regs) set_thread_flag(TIF_RESTART_SVC); } } + +void do_notify_resume(struct pt_regs *regs) +{ + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); +} -- cgit v1.2.3 From b2300b9efe1b8174833e17f37e975c9da00c388a Mon Sep 17 00:00:00 2001 From: Hongjie Yang Date: Fri, 10 Oct 2008 21:33:21 +0200 Subject: [S390] dcssblk: add >2G DCSSs support and stacked contiguous DCSSs support. The DCSS block device driver is modified to add >2G DCSSs support and allow a DCSS block device to map to a set of contiguous DCSSs. The extmem code is also modified to use new Diagnose x'64' subcodes for >2G DCSSs. Signed-off-by: Hongjie Yang Signed-off-by: Martin Schwidefsky --- arch/s390/mm/extmem.c | 251 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 212 insertions(+), 39 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index f231f5ec74b..580fc64cc73 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -43,20 +43,40 @@ #define DCSS_FINDSEG 0x0c #define DCSS_LOADNOLY 0x10 #define DCSS_SEGEXT 0x18 +#define DCSS_LOADSHRX 0x20 +#define DCSS_LOADNSRX 0x24 +#define DCSS_FINDSEGX 0x2c +#define DCSS_SEGEXTX 0x38 #define DCSS_FINDSEGA 0x0c struct qrange { - unsigned int start; // 3byte start address, 1 byte type - unsigned int end; // 3byte end address, 1 byte reserved + unsigned long start; /* last byte type */ + unsigned long end; /* last byte reserved */ }; struct qout64 { + unsigned long segstart; + unsigned long segend; + int segcnt; + int segrcnt; + struct qrange range[6]; +}; + +#ifdef CONFIG_64BIT +struct qrange_old { + unsigned int start; /* last byte type */ + unsigned int end; /* last byte reserved */ +}; + +/* output area format for the Diag x'64' old subcode x'18' */ +struct qout64_old { int segstart; int segend; int segcnt; int segrcnt; - struct qrange range[6]; + struct qrange_old range[6]; }; +#endif struct qin64 { char qopcode; @@ -86,6 +106,55 @@ static DEFINE_MUTEX(dcss_lock); static LIST_HEAD(dcss_list); static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", "EN", "SC", "EW/EN-MIXED" }; +static int loadshr_scode, loadnsr_scode, findseg_scode; +static int segext_scode, purgeseg_scode; +static int scode_set; + +/* set correct Diag x'64' subcodes. */ +static int +dcss_set_subcodes(void) +{ +#ifdef CONFIG_64BIT + char *name = kmalloc(8 * sizeof(char), GFP_DMA); + unsigned long rx, ry; + int rc; + + if (name == NULL) + return -ENOMEM; + + rx = (unsigned long) name; + ry = DCSS_FINDSEGX; + + strcpy(name, "dummy"); + asm volatile( + " diag %0,%1,0x64\n" + "0: ipm %2\n" + " srl %2,28\n" + " j 2f\n" + "1: la %2,3\n" + "2:\n" + EX_TABLE(0b, 1b) + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + + kfree(name); + /* Diag x'64' new subcodes are supported, set to new subcodes */ + if (rc != 3) { + loadshr_scode = DCSS_LOADSHRX; + loadnsr_scode = DCSS_LOADNSRX; + purgeseg_scode = DCSS_PURGESEG; + findseg_scode = DCSS_FINDSEGX; + segext_scode = DCSS_SEGEXTX; + return 0; + } +#endif + /* Diag x'64' new subcodes are not supported, set to old subcodes */ + loadshr_scode = DCSS_LOADNOLY; + loadnsr_scode = DCSS_LOADNSR; + purgeseg_scode = DCSS_PURGESEG; + findseg_scode = DCSS_FINDSEG; + segext_scode = DCSS_SEGEXT; + return 0; +} /* * Create the 8 bytes, ebcdic VM segment name from @@ -135,25 +204,45 @@ segment_by_name (char *name) * Perform a function on a dcss segment. */ static inline int -dcss_diag (__u8 func, void *parameter, +dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; int rc; + if (scode_set == 0) { + rc = dcss_set_subcodes(); + if (rc < 0) + return rc; + scode_set = 1; + } rx = (unsigned long) parameter; - ry = (unsigned long) func; - asm volatile( + ry = (unsigned long) *func; + #ifdef CONFIG_64BIT - " sam31\n" - " diag %0,%1,0x64\n" - " sam64\n" + /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */ + if (*func > DCSS_SEGEXT) + asm volatile( + " diag %0,%1,0x64\n" + " ipm %2\n" + " srl %2,28\n" + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */ + else + asm volatile( + " sam31\n" + " diag %0,%1,0x64\n" + " sam64\n" + " ipm %2\n" + " srl %2,28\n" + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); #else + asm volatile( " diag %0,%1,0x64\n" -#endif " ipm %2\n" " srl %2,28\n" : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); +#endif *ret1 = rx; *ret2 = ry; return rc; @@ -190,14 +279,45 @@ query_segment_type (struct dcss_segment *seg) qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); - diag_cc = dcss_diag (DCSS_SEGEXT, qin, &dummy, &vmrc); + diag_cc = dcss_diag(&segext_scode, qin, &dummy, &vmrc); + if (diag_cc < 0) { + rc = diag_cc; + goto out_free; + } if (diag_cc > 1) { PRINT_WARN ("segment_type: diag returned error %ld\n", vmrc); rc = dcss_diag_translate_rc (vmrc); goto out_free; } +#ifdef CONFIG_64BIT + /* Only old format of output area of Diagnose x'64' is supported, + copy data for the new format. */ + if (segext_scode == DCSS_SEGEXT) { + struct qout64_old *qout_old; + qout_old = kzalloc(sizeof(struct qout64_old), GFP_DMA); + if (qout_old == NULL) { + rc = -ENOMEM; + goto out_free; + } + memcpy(qout_old, qout, sizeof(struct qout64_old)); + qout->segstart = (unsigned long) qout_old->segstart; + qout->segend = (unsigned long) qout_old->segend; + qout->segcnt = qout_old->segcnt; + qout->segrcnt = qout_old->segrcnt; + + if (qout->segcnt > 6) + qout->segrcnt = 6; + for (i = 0; i < qout->segrcnt; i++) { + qout->range[i].start = + (unsigned long) qout_old->range[i].start; + qout->range[i].end = + (unsigned long) qout_old->range[i].end; + } + kfree(qout_old); + } +#endif if (qout->segcnt > 6) { rc = -ENOTSUPP; goto out_free; @@ -268,6 +388,30 @@ segment_type (char* name) return seg.vm_segtype; } +/* + * check if segment collides with other segments that are currently loaded + * returns 1 if this is the case, 0 if no collision was found + */ +static int +segment_overlaps_others (struct dcss_segment *seg) +{ + struct list_head *l; + struct dcss_segment *tmp; + + BUG_ON(!mutex_is_locked(&dcss_lock)); + list_for_each(l, &dcss_list) { + tmp = list_entry(l, struct dcss_segment, list); + if ((tmp->start_addr >> 20) > (seg->end >> 20)) + continue; + if ((tmp->end >> 20) < (seg->start_addr >> 20)) + continue; + if (seg == tmp) + continue; + return 1; + } + return 0; +} + /* * real segment loading function, called from segment_load */ @@ -276,7 +420,8 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long { struct dcss_segment *seg = kmalloc(sizeof(struct dcss_segment), GFP_DMA); - int dcss_command, rc, diag_cc; + int rc, diag_cc; + unsigned long start_addr, end_addr, dummy; if (seg == NULL) { rc = -ENOMEM; @@ -287,6 +432,13 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc < 0) goto out_free; + if (loadshr_scode == DCSS_LOADSHRX) { + if (segment_overlaps_others(seg)) { + rc = -EBUSY; + goto out_free; + } + } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); if (rc) @@ -316,20 +468,28 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long } if (do_nonshared) - dcss_command = DCSS_LOADNSR; + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); else - dcss_command = DCSS_LOADNOLY; - - diag_cc = dcss_diag(dcss_command, seg->dcss_name, - &seg->start_addr, &seg->end); + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); + rc = diag_cc; + goto out_resource; + } if (diag_cc > 1) { PRINT_WARN ("segment_load: could not load segment %s - " - "diag returned error (%ld)\n",name,seg->end); - rc = dcss_diag_translate_rc (seg->end); - dcss_diag(DCSS_PURGESEG, seg->dcss_name, - &seg->start_addr, &seg->end); + "diag returned error (%ld)\n", + name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); goto out_resource; } + seg->start_addr = start_addr; + seg->end = end_addr; seg->do_nonshared = do_nonshared; atomic_set(&seg->ref_count, 1); list_add(&seg->list, &dcss_list); @@ -423,8 +583,8 @@ int segment_modify_shared (char *name, int do_nonshared) { struct dcss_segment *seg; - unsigned long dummy; - int dcss_command, rc, diag_cc; + unsigned long start_addr, end_addr, dummy; + int rc, diag_cc; mutex_lock(&dcss_lock); seg = segment_by_name (name); @@ -445,38 +605,51 @@ segment_modify_shared (char *name, int do_nonshared) goto out_unlock; } release_resource(seg->res); - if (do_nonshared) { - dcss_command = DCSS_LOADNSR; + if (do_nonshared) seg->res->flags &= ~IORESOURCE_READONLY; - } else { - dcss_command = DCSS_LOADNOLY; + else if (seg->vm_segtype == SEG_TYPE_SR || seg->vm_segtype == SEG_TYPE_ER) seg->res->flags |= IORESOURCE_READONLY; - } + if (request_resource(&iomem_resource, seg->res)) { PRINT_WARN("segment_modify_shared: could not reload segment %s" " - overlapping resources\n", name); rc = -EBUSY; kfree(seg->res); - goto out_del; + goto out_del_mem; + } + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + if (do_nonshared) + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); + else + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + rc = diag_cc; + goto out_del_res; } - dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy); - diag_cc = dcss_diag(dcss_command, seg->dcss_name, - &seg->start_addr, &seg->end); if (diag_cc > 1) { PRINT_WARN ("segment_modify_shared: could not reload segment %s" - " - diag returned error (%ld)\n",name,seg->end); - rc = dcss_diag_translate_rc (seg->end); - goto out_del; + " - diag returned error (%ld)\n", + name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + goto out_del_res; } + seg->start_addr = start_addr; + seg->end = end_addr; seg->do_nonshared = do_nonshared; rc = 0; goto out_unlock; - out_del: + out_del_res: + release_resource(seg->res); + kfree(seg->res); + out_del_mem: vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -510,7 +683,7 @@ segment_unload(char *name) kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -545,7 +718,7 @@ segment_save(char *name) endpfn = (seg->end) >> PAGE_SHIFT; sprintf(cmd1, "DEFSEG %s", name); for (i=0; isegcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %X-%X %s", + sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", seg->range[i].start >> PAGE_SHIFT, seg->range[i].end >> PAGE_SHIFT, segtype_string[seg->range[i].start & 0xff]); -- cgit v1.2.3 From 5a0d0e65379256b4da2c9092e197a2c761f51c01 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 10 Oct 2008 21:33:22 +0200 Subject: [S390] Move private simple udelay function to arch/s390/lib/delay.c. Move cio's private simple udelay function to lib/delay.c and turn it into something much more readable. So we have all implementations at one place. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/delay.h | 1 + arch/s390/lib/delay.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h index 78357314c45..a356c958e26 100644 --- a/arch/s390/include/asm/delay.h +++ b/arch/s390/include/asm/delay.h @@ -15,6 +15,7 @@ #define _S390_DELAY_H extern void __udelay(unsigned long usecs); +extern void udelay_simple(unsigned long usecs); extern void __delay(unsigned long loops); #define udelay(n) __udelay(n) diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 0953cee05ef..6ccb9fab055 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -92,3 +92,16 @@ out: local_irq_restore(flags); preempt_enable(); } + +/* + * Simple udelay variant. To be used on startup and reboot + * when the interrupt handler isn't working. + */ +void udelay_simple(unsigned long usecs) +{ + u64 end; + + end = get_clock() + ((u64) usecs << 12); + while (get_clock() < end) + cpu_relax(); +} -- cgit v1.2.3 From ab1d848fd6a9151b02c6cbf4bddce6e24707b094 Mon Sep 17 00:00:00 2001 From: Nigel Hislop Date: Fri, 10 Oct 2008 21:33:25 +0200 Subject: [S390] Add ioctl support for EMC Symmetrix Subsystem Control I/O EMC Symmetrix Subsystem Control I/O through CKD dasd requires a specific parameter list sent to the array via a Perform Subsystem Function CCW. The Symmetrix response is retrieved from the array via a Read Subsystem Data CCW. Signed-off-by: Nigel Hislop Signed-off-by: Hannes Reinecke Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/dasd.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/dasd.h b/arch/s390/include/asm/dasd.h index 3f002e13d02..55b2b80cdf6 100644 --- a/arch/s390/include/asm/dasd.h +++ b/arch/s390/include/asm/dasd.h @@ -3,6 +3,8 @@ * Author(s)......: Holger Smolinski * Bugreports.to..: * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 + * EMC Symmetrix ioctl Copyright EMC Corporation, 2008 + * Author.........: Nigel Hislop * * This file is the interface of the DASD device driver, which is exported to user space * any future changes wrt the API will result in a change of the APIVERSION reported @@ -202,6 +204,16 @@ typedef struct attrib_data_t { #define DASD_SEQ_PRESTAGE 0x4 #define DASD_REC_ACCESS 0x5 +/* + * Perform EMC Symmetrix I/O + */ +typedef struct dasd_symmio_parms { + unsigned char reserved[8]; /* compat with older releases */ + unsigned long long psf_data; /* char * cast to u64 */ + unsigned long long rssd_result; /* char * cast to u64 */ + int psf_data_len; + int rssd_result_len; +} __attribute__ ((packed)) dasd_symmio_parms_t; /******************************************************************************** * SECTION: Definition of IOCTLs @@ -247,6 +259,7 @@ typedef struct attrib_data_t { /* Set Attributes (cache operations) */ #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) +#define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t) #endif /* DASD_H */ -- cgit v1.2.3 From 15e86b0c752d50e910b2cca6e83ce74c4440d06c Mon Sep 17 00:00:00 2001 From: Florian Funke Date: Fri, 10 Oct 2008 21:33:26 +0200 Subject: [S390] introduce dirty bit for kvm live migration This patch defines a dirty bit in the PGSTE that can be used to implement dirty pages logging for KVM's live migration. The bit is set in the ptep_rcp_copy function, which is called to save dirty and referenced information from the storage key in the PGSTE. The bit can be tested and reset by KVM using the kvm_s390_test_and_clear_page_dirty function that is introduced by this patch. Acked-by: Carsten Otte Signed-off-by: Florian Funke Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 45 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0bdb704ae05..1a928f84afd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -281,6 +281,9 @@ extern char empty_zero_page[PAGE_SIZE]; #define RCP_GR_BIT 50 #define RCP_GC_BIT 49 +/* User dirty bit for KVM's migration feature */ +#define KVM_UD_BIT 47 + #ifndef __s390x__ /* Bits in the segment table address-space-control-element */ @@ -575,12 +578,16 @@ static inline void ptep_rcp_copy(pte_t *ptep) unsigned long *pgste = (unsigned long *) (ptep + PTRS_PER_PTE); skey = page_get_storage_key(page_to_phys(page)); - if (skey & _PAGE_CHANGED) + if (skey & _PAGE_CHANGED) { set_bit_simple(RCP_GC_BIT, pgste); + set_bit_simple(KVM_UD_BIT, pgste); + } if (skey & _PAGE_REFERENCED) set_bit_simple(RCP_GR_BIT, pgste); - if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) + if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) { SetPageDirty(page); + set_bit_simple(KVM_UD_BIT, pgste); + } if (test_and_clear_bit_simple(RCP_HR_BIT, pgste)) SetPageReferenced(page); #endif @@ -744,6 +751,40 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte; } +#ifdef CONFIG_PGSTE +/* + * Get (and clear) the user dirty bit for a PTE. + */ +static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm, + pte_t *ptep) +{ + int dirty; + unsigned long *pgste; + struct page *page; + unsigned int skey; + + if (!mm->context.pgstes) + return -EINVAL; + rcp_lock(ptep); + pgste = (unsigned long *) (ptep + PTRS_PER_PTE); + page = virt_to_page(pte_val(*ptep)); + skey = page_get_storage_key(page_to_phys(page)); + if (skey & _PAGE_CHANGED) { + set_bit_simple(RCP_GC_BIT, pgste); + set_bit_simple(KVM_UD_BIT, pgste); + } + if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) { + SetPageDirty(page); + set_bit_simple(KVM_UD_BIT, pgste); + } + dirty = test_and_clear_bit_simple(KVM_UD_BIT, pgste); + if (skey & _PAGE_CHANGED) + page_clear_dirty(page); + rcp_unlock(ptep); + return dirty; +} +#endif + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) -- cgit v1.2.3 From 4a672cfa3a7fcbc6f2adc558f34148be1096c561 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 10 Oct 2008 21:33:29 +0200 Subject: [S390] fix initialization of stp chsc_sstpc returns -EIO on error and 0 on success but stp_reset checks against 1 instead of 0. chsc_sstpc used to return 1 on success, one call location has not been updated .. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 06acb1a18bb..b94e9e3b694 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1356,7 +1356,7 @@ static void __init stp_reset(void) stp_page = alloc_bootmem_pages(PAGE_SIZE); rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); - if (rc == 1) + if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { printk(KERN_WARNING "Running on non STP capable machine.\n"); -- cgit v1.2.3