diff options
38 files changed, 1196 insertions, 354 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6a5de5643e0b..cac0ba127576 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2211,6 +2211,8 @@ KVM_S390_SIGP_STOP (vcpu) - sigp restart KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt +KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt parameters in parm and parm64 KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0d45f6fe734f..a27f5007062a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -32,8 +32,10 @@ #define KVM_NR_IRQCHIPS 1 #define KVM_IRQCHIP_NUM_PINS 4096 +#define SIGP_CTRL_C 0x00800000 + struct sca_entry { - atomic_t scn; + atomic_t ctrl; __u32 reserved; __u64 sda; __u64 reserved2[2]; @@ -72,6 +74,7 @@ struct sca_block { #define CPUSTAT_ZARCH 0x00000800 #define CPUSTAT_MCDS 0x00000100 #define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 #define CPUSTAT_G 0x00000008 #define CPUSTAT_GED 0x00000004 #define CPUSTAT_J 0x00000002 @@ -79,7 +82,9 @@ struct sca_block { struct kvm_s390_sie_block { atomic_t cpuflags; /* 0x0000 */ - __u32 prefix; /* 0x0004 */ + __u32 : 1; /* 0x0004 */ + __u32 prefix : 18; + __u32 : 13; __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ @@ -131,7 +136,10 @@ struct kvm_s390_sie_block { psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[28]; /* 0x00b0 */ + __u8 reservedb0[20]; /* 0x00b0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + __u32 reservedc8; /* 0x00c8 */ __u16 pgmilc; /* 0x00cc */ __u16 iprcc; /* 0x00ce */ __u32 dxc; /* 0x00d0 */ @@ -411,6 +419,7 @@ struct kvm_arch{ int use_cmma; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; + spinlock_t start_stop_lock; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 943d43451116..1aba89b53cb9 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -66,5 +66,6 @@ int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); unsigned long sclp_get_hsa_size(void); void sclp_early_detect(void); int sclp_has_siif(void); +unsigned int sclp_get_ibc(void); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h new file mode 100644 index 000000000000..3d97f610198d --- /dev/null +++ b/arch/s390/include/uapi/asm/sie.h @@ -0,0 +1,245 @@ +#ifndef _UAPI_ASM_S390_SIE_H +#define _UAPI_ASM_S390_SIE_H + +#include <asm/sigp.h> + +#define diagnose_codes \ + { 0x10, "DIAG (0x10) release pages" }, \ + { 0x44, "DIAG (0x44) time slice end" }, \ + { 0x9c, "DIAG (0x9c) time slice end directed" }, \ + { 0x204, "DIAG (0x204) logical-cpu utilization" }, \ + { 0x258, "DIAG (0x258) page-reference services" }, \ + { 0x308, "DIAG (0x308) ipl functions" }, \ + { 0x500, "DIAG (0x500) KVM virtio functions" }, \ + { 0x501, "DIAG (0x501) KVM breakpoint" } + +#define sigp_order_codes \ + { SIGP_SENSE, "SIGP sense" }, \ + { SIGP_EXTERNAL_CALL, "SIGP external call" }, \ + { SIGP_EMERGENCY_SIGNAL, "SIGP emergency signal" }, \ + { SIGP_STOP, "SIGP stop" }, \ + { SIGP_STOP_AND_STORE_STATUS, "SIGP stop and store status" }, \ + { SIGP_SET_ARCHITECTURE, "SIGP set architecture" }, \ + { SIGP_SET_PREFIX, "SIGP set prefix" }, \ + { SIGP_SENSE_RUNNING, "SIGP sense running" }, \ + { SIGP_RESTART, "SIGP restart" }, \ + { SIGP_INITIAL_CPU_RESET, "SIGP initial cpu reset" }, \ + { SIGP_STORE_STATUS_AT_ADDRESS, "SIGP store status at address" } + +#define icpt_prog_codes \ + { 0x0001, "Prog Operation" }, \ + { 0x0002, "Prog Privileged Operation" }, \ + { 0x0003, "Prog Execute" }, \ + { 0x0004, "Prog Protection" }, \ + { 0x0005, "Prog Addressing" }, \ + { 0x0006, "Prog Specification" }, \ + { 0x0007, "Prog Data" }, \ + { 0x0008, "Prog Fixedpoint overflow" }, \ + { 0x0009, "Prog Fixedpoint divide" }, \ + { 0x000A, "Prog Decimal overflow" }, \ + { 0x000B, "Prog Decimal divide" }, \ + { 0x000C, "Prog HFP exponent overflow" }, \ + { 0x000D, "Prog HFP exponent underflow" }, \ + { 0x000E, "Prog HFP significance" }, \ + { 0x000F, "Prog HFP divide" }, \ + { 0x0010, "Prog Segment translation" }, \ + { 0x0011, "Prog Page translation" }, \ + { 0x0012, "Prog Translation specification" }, \ + { 0x0013, "Prog Special operation" }, \ + { 0x0015, "Prog Operand" }, \ + { 0x0016, "Prog Trace table" }, \ + { 0x0017, "Prog ASNtranslation specification" }, \ + { 0x001C, "Prog Spaceswitch event" }, \ + { 0x001D, "Prog HFP square root" }, \ + { 0x001F, "Prog PCtranslation specification" }, \ + { 0x0020, "Prog AFX translation" }, \ + { 0x0021, "Prog ASX translation" }, \ + { 0x0022, "Prog LX translation" }, \ + { 0x0023, "Prog EX translation" }, \ + { 0x0024, "Prog Primary authority" }, \ + { 0x0025, "Prog Secondary authority" }, \ + { 0x0026, "Prog LFXtranslation exception" }, \ + { 0x0027, "Prog LSXtranslation exception" }, \ + { 0x0028, "Prog ALET specification" }, \ + { 0x0029, "Prog ALEN translation" }, \ + { 0x002A, "Prog ALE sequence" }, \ + { 0x002B, "Prog ASTE validity" }, \ + { 0x002C, "Prog ASTE sequence" }, \ + { 0x002D, "Prog Extended authority" }, \ + { 0x002E, "Prog LSTE sequence" }, \ + { 0x002F, "Prog ASTE instance" }, \ + { 0x0030, "Prog Stack full" }, \ + { 0x0031, "Prog Stack empty" }, \ + { 0x0032, "Prog Stack specification" }, \ + { 0x0033, "Prog Stack type" }, \ + { 0x0034, "Prog Stack operation" }, \ + { 0x0039, "Prog Region first translation" }, \ + { 0x003A, "Prog Region second translation" }, \ + { 0x003B, "Prog Region third translation" }, \ + { 0x0040, "Prog Monitor event" }, \ + { 0x0080, "Prog PER event" }, \ + { 0x0119, "Prog Crypto operation" } + +#define exit_code_ipa0(ipa0, opcode, mnemonic) \ + { (ipa0 << 8 | opcode), #ipa0 " " mnemonic } +#define exit_code(opcode, mnemonic) \ + { opcode, mnemonic } + +#define icpt_insn_codes \ + exit_code_ipa0(0x01, 0x01, "PR"), \ + exit_code_ipa0(0x01, 0x04, "PTFF"), \ + exit_code_ipa0(0x01, 0x07, "SCKPF"), \ + exit_code_ipa0(0xAA, 0x00, "RINEXT"), \ + exit_code_ipa0(0xAA, 0x01, "RION"), \ + exit_code_ipa0(0xAA, 0x02, "TRIC"), \ + exit_code_ipa0(0xAA, 0x03, "RIOFF"), \ + exit_code_ipa0(0xAA, 0x04, "RIEMIT"), \ + exit_code_ipa0(0xB2, 0x02, "STIDP"), \ + exit_code_ipa0(0xB2, 0x04, "SCK"), \ + exit_code_ipa0(0xB2, 0x05, "STCK"), \ + exit_code_ipa0(0xB2, 0x06, "SCKC"), \ + exit_code_ipa0(0xB2, 0x07, "STCKC"), \ + exit_code_ipa0(0xB2, 0x08, "SPT"), \ + exit_code_ipa0(0xB2, 0x09, "STPT"), \ + exit_code_ipa0(0xB2, 0x0d, "PTLB"), \ + exit_code_ipa0(0xB2, 0x10, "SPX"), \ + exit_code_ipa0(0xB2, 0x11, "STPX"), \ + exit_code_ipa0(0xB2, 0x12, "STAP"), \ + exit_code_ipa0(0xB2, 0x14, "SIE"), \ + exit_code_ipa0(0xB2, 0x16, "SETR"), \ + exit_code_ipa0(0xB2, 0x17, "STETR"), \ + exit_code_ipa0(0xB2, 0x18, "PC"), \ + exit_code_ipa0(0xB2, 0x20, "SERVC"), \ + exit_code_ipa0(0xB2, 0x28, "PT"), \ + exit_code_ipa0(0xB2, 0x29, "ISKE"), \ + exit_code_ipa0(0xB2, 0x2a, "RRBE"), \ + exit_code_ipa0(0xB2, 0x2b, "SSKE"), \ + exit_code_ipa0(0xB2, 0x2c, "TB"), \ + exit_code_ipa0(0xB2, 0x2e, "PGIN"), \ + exit_code_ipa0(0xB2, 0x2f, "PGOUT"), \ + exit_code_ipa0(0xB2, 0x30, "CSCH"), \ + exit_code_ipa0(0xB2, 0x31, "HSCH"), \ + exit_code_ipa0(0xB2, 0x32, "MSCH"), \ + exit_code_ipa0(0xB2, 0x33, "SSCH"), \ + exit_code_ipa0(0xB2, 0x34, "STSCH"), \ + exit_code_ipa0(0xB2, 0x35, "TSCH"), \ + exit_code_ipa0(0xB2, 0x36, "TPI"), \ + exit_code_ipa0(0xB2, 0x37, "SAL"), \ + exit_code_ipa0(0xB2, 0x38, "RSCH"), \ + exit_code_ipa0(0xB2, 0x39, "STCRW"), \ + exit_code_ipa0(0xB2, 0x3a, "STCPS"), \ + exit_code_ipa0(0xB2, 0x3b, "RCHP"), \ + exit_code_ipa0(0xB2, 0x3c, "SCHM"), \ + exit_code_ipa0(0xB2, 0x40, "BAKR"), \ + exit_code_ipa0(0xB2, 0x48, "PALB"), \ + exit_code_ipa0(0xB2, 0x4c, "TAR"), \ + exit_code_ipa0(0xB2, 0x50, "CSP"), \ + exit_code_ipa0(0xB2, 0x54, "MVPG"), \ + exit_code_ipa0(0xB2, 0x58, "BSG"), \ + exit_code_ipa0(0xB2, 0x5a, "BSA"), \ + exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ + exit_code_ipa0(0xB2, 0x74, "SIGA"), \ + exit_code_ipa0(0xB2, 0x76, "XSCH"), \ + exit_code_ipa0(0xB2, 0x78, "STCKE"), \ + exit_code_ipa0(0xB2, 0x7c, "STCKF"), \ + exit_code_ipa0(0xB2, 0x7d, "STSI"), \ + exit_code_ipa0(0xB2, 0xb0, "STFLE"), \ + exit_code_ipa0(0xB2, 0xb1, "STFL"), \ + exit_code_ipa0(0xB2, 0xb2, "LPSWE"), \ + exit_code_ipa0(0xB2, 0xf8, "TEND"), \ + exit_code_ipa0(0xB2, 0xfc, "TABORT"), \ + exit_code_ipa0(0xB9, 0x1e, "KMAC"), \ + exit_code_ipa0(0xB9, 0x28, "PCKMO"), \ + exit_code_ipa0(0xB9, 0x2a, "KMF"), \ + exit_code_ipa0(0xB9, 0x2b, "KMO"), \ + exit_code_ipa0(0xB9, 0x2d, "KMCTR"), \ + exit_code_ipa0(0xB9, 0x2e, "KM"), \ + exit_code_ipa0(0xB9, 0x2f, "KMC"), \ + exit_code_ipa0(0xB9, 0x3e, "KIMD"), \ + exit_code_ipa0(0xB9, 0x3f, "KLMD"), \ + exit_code_ipa0(0xB9, 0x8a, "CSPG"), \ + exit_code_ipa0(0xB9, 0x8d, "EPSW"), \ + exit_code_ipa0(0xB9, 0x8e, "IDTE"), \ + exit_code_ipa0(0xB9, 0x8f, "CRDTE"), \ + exit_code_ipa0(0xB9, 0x9c, "EQBS"), \ + exit_code_ipa0(0xB9, 0xa2, "PTF"), \ + exit_code_ipa0(0xB9, 0xab, "ESSA"), \ + exit_code_ipa0(0xB9, 0xae, "RRBM"), \ + exit_code_ipa0(0xB9, 0xaf, "PFMF"), \ + exit_code_ipa0(0xE3, 0x03, "LRAG"), \ + exit_code_ipa0(0xE3, 0x13, "LRAY"), \ + exit_code_ipa0(0xE3, 0x25, "NTSTG"), \ + exit_code_ipa0(0xE5, 0x00, "LASP"), \ + exit_code_ipa0(0xE5, 0x01, "TPROT"), \ + exit_code_ipa0(0xE5, 0x60, "TBEGIN"), \ + exit_code_ipa0(0xE5, 0x61, "TBEGINC"), \ + exit_code_ipa0(0xEB, 0x25, "STCTG"), \ + exit_code_ipa0(0xEB, 0x2f, "LCTLG"), \ + exit_code_ipa0(0xEB, 0x60, "LRIC"), \ + exit_code_ipa0(0xEB, 0x61, "STRIC"), \ + exit_code_ipa0(0xEB, 0x62, "MRIC"), \ + exit_code_ipa0(0xEB, 0x8a, "SQBS"), \ + exit_code_ipa0(0xC8, 0x01, "ECTG"), \ + exit_code(0x0a, "SVC"), \ + exit_code(0x80, "SSM"), \ + exit_code(0x82, "LPSW"), \ + exit_code(0x83, "DIAG"), \ + exit_code(0xae, "SIGP"), \ + exit_code(0xac, "STNSM"), \ + exit_code(0xad, "STOSM"), \ + exit_code(0xb1, "LRA"), \ + exit_code(0xb6, "STCTL"), \ + exit_code(0xb7, "LCTL"), \ + exit_code(0xee, "PLO") + +#define sie_intercept_code \ + { 0x00, "Host interruption" }, \ + { 0x04, "Instruction" }, \ + { 0x08, "Program interruption" }, \ + { 0x0c, "Instruction and program interruption" }, \ + { 0x10, "External request" }, \ + { 0x14, "External interruption" }, \ + { 0x18, "I/O request" }, \ + { 0x1c, "Wait state" }, \ + { 0x20, "Validity" }, \ + { 0x28, "Stop request" }, \ + { 0x2c, "Operation exception" }, \ + { 0x38, "Partial-execution" }, \ + { 0x3c, "I/O interruption" }, \ + { 0x40, "I/O instruction" }, \ + { 0x48, "Timing subset" } + +/* + * This is the simple interceptable instructions decoder. + * + * It will be used as userspace interface and it can be used in places + * that does not allow to use general decoder functions, + * such as trace events declarations. + * + * Some userspace tools may want to parse this code + * and would be confused by switch(), if() and other statements, + * but they can understand conditional operator. + */ +#define INSN_DECODE_IPA0(ipa0, insn, rshift, mask) \ + (insn >> 56) == (ipa0) ? \ + ((ipa0 << 8) | ((insn >> rshift) & mask)) : + +#define INSN_DECODE(insn) (insn >> 56) + +/* + * The macro icpt_insn_decoder() takes an intercepted instruction + * and returns a key, which can be used to find a mnemonic name + * of the instruction in the icpt_insn_codes table. + */ +#define icpt_insn_decoder(insn) \ + INSN_DECODE_IPA0(0x01, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f) \ + INSN_DECODE_IPA0(0xb2, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xb9, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xe3, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xe5, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xeb, insn, 16, 0xff) \ + INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f) \ + INSN_DECODE(insn) + +#endif /* _UAPI_ASM_S390_SIE_H */ diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 5521ace8b60d..0161675878a2 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -23,7 +23,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) { unsigned long start, end; - unsigned long prefix = vcpu->arch.sie_block->prefix; + unsigned long prefix = kvm_s390_get_prefix(vcpu); start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; @@ -176,7 +176,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } - atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; vcpu->run->s390_reset_flags |= KVM_S390_RESET_IPL; vcpu->run->s390_reset_flags |= KVM_S390_RESET_CPU_INIT; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 691fdb776c90..db608c3f9303 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -643,3 +643,31 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } return rc; } + +/** + * kvm_s390_check_low_addr_protection - check for low-address protection + * @ga: Guest address + * + * Checks whether an address is subject to low-address protection and set + * up vcpu->arch.pgm accordingly if necessary. + * + * Return: 0 if no protection exception, or PGM_PROTECTION if protected. + */ +int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga) +{ + struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; + psw_t *psw = &vcpu->arch.sie_block->gpsw; + struct trans_exc_code_bits *tec_bits; + + if (!is_low_address(ga) || !low_address_protection_enabled(vcpu)) + return 0; + + memset(pgm, 0, sizeof(*pgm)); + tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + tec_bits->fsi = FSI_STORE; + tec_bits->as = psw_bits(*psw).as; + tec_bits->addr = ga >> PAGE_SHIFT; + pgm->code = PGM_PROTECTION; + + return pgm->code; +} diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 1079c8fc6d0d..a07ee08ac478 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -30,7 +30,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, unsigned long gra) { - unsigned long prefix = vcpu->arch.sie_block->prefix; + unsigned long prefix = kvm_s390_get_prefix(vcpu); if (gra < 2 * PAGE_SIZE) gra += prefix; @@ -99,7 +99,7 @@ static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long __gpa; \ \ __gpa = (unsigned long)(gra); \ - __gpa += __vcpu->arch.sie_block->prefix; \ + __gpa += kvm_s390_get_prefix(__vcpu); \ kvm_write_guest(__vcpu->kvm, __gpa, &__x, sizeof(__x)); \ }) @@ -124,7 +124,7 @@ static inline __must_check int write_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len) { - unsigned long gpa = gra + vcpu->arch.sie_block->prefix; + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); return kvm_write_guest(vcpu->kvm, gpa, data, len); } @@ -150,7 +150,7 @@ static inline __must_check int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len) { - unsigned long gpa = gra + vcpu->arch.sie_block->prefix; + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); return kvm_read_guest(vcpu->kvm, gpa, data, len); } @@ -325,5 +325,6 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, } int ipte_lock_held(struct kvm_vcpu *vcpu); +int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 757ccef62fd5..3e8d4092ce30 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -223,9 +223,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, goto error; } - ret = copy_from_user(bp_data, dbg->arch.hw_bp, size); - if (ret) + if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) { + ret = -EFAULT; goto error; + } for (i = 0; i < dbg->arch.nr_hw_bp; i++) { switch (bp_data[i].type) { diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 30e1c5eb726a..a0b586c1913c 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -1,7 +1,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2014 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -17,6 +17,7 @@ #include <asm/kvm_host.h> #include <asm/asm-offsets.h> +#include <asm/irq.h> #include "kvm-s390.h" #include "gaccess.h" @@ -46,9 +47,6 @@ static int handle_noop(struct kvm_vcpu *vcpu) case 0x10: vcpu->stat.exit_external_request++; break; - case 0x14: - vcpu->stat.exit_external_interrupt++; - break; default: break; /* nothing */ } @@ -65,8 +63,7 @@ static int handle_stop(struct kvm_vcpu *vcpu) trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { - atomic_set_mask(CPUSTAT_STOPPED, - &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_stop(vcpu); vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); rc = -EOPNOTSUPP; @@ -198,6 +195,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) static int handle_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_pgm_info pgm_info; + psw_t psw; int rc; vcpu->stat.exit_program_interruption++; @@ -210,7 +208,14 @@ static int handle_prog(struct kvm_vcpu *vcpu) } trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc); - + if (vcpu->arch.sie_block->iprcc == PGM_SPECIFICATION) { + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &psw, sizeof(psw_t)); + if (rc) + return rc; + /* Avoid endless loops of specification exceptions */ + if (!is_valid_psw(&psw)) + return -EOPNOTSUPP; + } rc = handle_itdb(vcpu); if (rc) return rc; @@ -234,17 +239,110 @@ static int handle_instruction_and_prog(struct kvm_vcpu *vcpu) return rc2; } +/** + * handle_external_interrupt - used for external interruption interceptions + * + * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if + * the new PSW does not have external interrupts disabled. In the first case, + * we've got to deliver the interrupt manually, and in the second case, we + * drop to userspace to handle the situation there. + */ +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ + u16 eic = vcpu->arch.sie_block->eic; + struct kvm_s390_interrupt irq; + psw_t newpsw; + int rc; + + vcpu->stat.exit_external_interrupt++; + + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + /* We can not handle clock comparator or timer interrupt with bad PSW */ + if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && + (newpsw.mask & PSW_MASK_EXT)) + return -EOPNOTSUPP; + + switch (eic) { + case EXT_IRQ_CLK_COMP: + irq.type = KVM_S390_INT_CLOCK_COMP; + break; + case EXT_IRQ_CPU_TIMER: + irq.type = KVM_S390_INT_CPU_TIMER; + break; + case EXT_IRQ_EXTERNAL_CALL: + if (kvm_s390_si_ext_call_pending(vcpu)) + return 0; + irq.type = KVM_S390_INT_EXTERNAL_CALL; + irq.parm = vcpu->arch.sie_block->extcpuaddr; + break; + default: + return -EOPNOTSUPP; + } + + return kvm_s390_inject_vcpu(vcpu, &irq); +} + +/** + * Handle MOVE PAGE partial execution interception. + * + * This interception can only happen for guests with DAT disabled and + * addresses that are currently not mapped in the host. Thus we try to + * set up the mappings for the corresponding user pages here (or throw + * addressing exceptions in case of illegal guest addresses). + */ +static int handle_mvpg_pei(struct kvm_vcpu *vcpu) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long srcaddr, dstaddr; + int reg1, reg2, rc; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + /* Make sure that the source is paged-in */ + srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]); + if (kvm_is_error_gpa(vcpu->kvm, srcaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); + if (rc != 0) + return rc; + + /* Make sure that the destination is paged-in */ + dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]); + if (kvm_is_error_gpa(vcpu->kvm, dstaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); + if (rc != 0) + return rc; + + psw->addr = __rewind_psw(*psw, 4); + + return 0; +} + +static int handle_partial_execution(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.sie_block->ipa == 0xb254) /* MVPG */ + return handle_mvpg_pei(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) /* SIGP */ + return kvm_s390_handle_sigp_pei(vcpu); + + return -EOPNOTSUPP; +} + static const intercept_handler_t intercept_funcs[] = { [0x00 >> 2] = handle_noop, [0x04 >> 2] = handle_instruction, [0x08 >> 2] = handle_prog, [0x0C >> 2] = handle_instruction_and_prog, [0x10 >> 2] = handle_noop, - [0x14 >> 2] = handle_noop, + [0x14 >> 2] = handle_external_interrupt, [0x18 >> 2] = handle_noop, [0x1C >> 2] = kvm_s390_handle_wait, [0x20 >> 2] = handle_validity, [0x28 >> 2] = handle_stop, + [0x38 >> 2] = handle_partial_execution, }; int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 077e4738ebdc..bf0d9bc15bcd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -27,6 +27,8 @@ #define IOINT_CSSID_MASK 0x03fc0000 #define IOINT_AI_MASK 0x04000000 +static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu); + static int is_ioint(u64 type) { return ((type & 0xfffe0000u) != 0xfffe0000u); @@ -89,6 +91,14 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, if (vcpu->arch.sie_block->gcr[0] & 0x4000ul) return 1; return 0; + case KVM_S390_INT_CLOCK_COMP: + return ckc_interrupts_enabled(vcpu); + case KVM_S390_INT_CPU_TIMER: + if (psw_extint_disabled(vcpu)) + return 0; + if (vcpu->arch.sie_block->gcr[0] & 0x400ul) + return 1; + return 0; case KVM_S390_INT_SERVICE: case KVM_S390_INT_PFAULT_INIT: case KVM_S390_INT_PFAULT_DONE: @@ -138,9 +148,8 @@ static void __unset_cpu_idle(struct kvm_vcpu *vcpu) static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) { - atomic_clear_mask(CPUSTAT_ECALL_PEND | - CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, - &vcpu->arch.sie_block->cpuflags); + atomic_clear_mask(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, + &vcpu->arch.sie_block->cpuflags); vcpu->arch.sie_block->lctl = 0x0000; vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT); @@ -166,6 +175,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu, case KVM_S390_INT_PFAULT_INIT: case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_CLOCK_COMP: + case KVM_S390_INT_CPU_TIMER: if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); else @@ -326,6 +337,24 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); break; + case KVM_S390_INT_CLOCK_COMP: + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->ext.ext_params, 0); + deliver_ckc_interrupt(vcpu); + break; + case KVM_S390_INT_CPU_TIMER: + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->ext.ext_params, 0); + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, inti->ext.ext_params, + (u32 *)__LC_EXT_PARAMS); + break; case KVM_S390_INT_SERVICE: VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", inti->ext.ext_params); @@ -413,7 +442,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, rc |= read_guest_lc(vcpu, offsetof(struct _lowcore, restart_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_start(vcpu); break; case KVM_S390_PROGRAM_INT: VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", @@ -494,6 +523,20 @@ static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu) } } +/* Check whether SIGP interpretation facility has an external call pending */ +int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu) +{ + atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl; + + if (!psw_extint_disabled(vcpu) && + (vcpu->arch.sie_block->gcr[0] & 0x2000ul) && + (atomic_read(sigp_ctrl) & SIGP_CTRL_C) && + (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) + return 1; + + return 0; +} + int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -524,6 +567,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) if (!rc && kvm_cpu_has_pending_timer(vcpu)) rc = 1; + if (!rc && kvm_s390_si_ext_call_pending(vcpu)) + rc = 1; + return rc; } @@ -580,7 +626,8 @@ no_timer: while (list_empty(&vcpu->arch.local_int.list) && list_empty(&vcpu->arch.local_int.float_int->list) && (!vcpu->arch.local_int.timer_due) && - !signal_pending(current)) { + !signal_pending(current) && + !kvm_s390_si_ext_call_pending(vcpu)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); @@ -637,6 +684,11 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) } atomic_set(&li->active, 0); spin_unlock_bh(&li->lock); + + /* clear pending external calls set by sigp interpretation facility */ + atomic_clear_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); + atomic_clear_mask(SIGP_CTRL_C, + &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl); } void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) @@ -984,6 +1036,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, break; case KVM_S390_SIGP_STOP: case KVM_S390_RESTART: + case KVM_S390_INT_CLOCK_COMP: + case KVM_S390_INT_CPU_TIMER: VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); inti->type = s390int->type; break; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b32c42cbc706..e519860c6031 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -458,6 +458,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.css_support = 0; kvm->arch.use_irqchip = 0; + spin_lock_init(&kvm->arch.start_stop_lock); + return 0; out_nogmap: debug_unregister(kvm->arch.dbf); @@ -592,7 +594,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->pp = 0; vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); - atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_stop(vcpu); kvm_s390_clear_local_irqs(vcpu); } @@ -631,7 +633,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= 0x10; vcpu->arch.sie_block->ecb2 = 8; - vcpu->arch.sie_block->eca = 0xC1002000U; + vcpu->arch.sie_block->eca = 0xD1002000U; if (sclp_has_siif()) vcpu->arch.sie_block->eca |= 1; vcpu->arch.sie_block->fac = (int) (long) vfacilities; @@ -751,7 +753,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) { + if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); exit_sie_sync(vcpu); @@ -996,8 +998,15 @@ bool kvm_s390_cmma_enabled(struct kvm *kvm) return true; } +static bool ibs_enabled(struct kvm_vcpu *vcpu) +{ + return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS; +} + static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { +retry: + s390_vcpu_unblock(vcpu); /* * We use MMU_RELOAD just to re-arm the ipte notifier for the * guest prefix page. gmap_ipte_notify will wait on the ptl lock. @@ -1005,27 +1014,61 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ - while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; rc = gmap_ipte_notify(vcpu->arch.gmap, - vcpu->arch.sie_block->prefix, + kvm_s390_get_prefix(vcpu), PAGE_SIZE * 2); if (rc) return rc; - s390_vcpu_unblock(vcpu); + goto retry; } + + if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) { + if (!ibs_enabled(vcpu)) { + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1); + atomic_set_mask(CPUSTAT_IBS, + &vcpu->arch.sie_block->cpuflags); + } + goto retry; + } + + if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) { + if (ibs_enabled(vcpu)) { + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0); + atomic_clear_mask(CPUSTAT_IBS, + &vcpu->arch.sie_block->cpuflags); + } + goto retry; + } + return 0; } -static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu) +/** + * kvm_arch_fault_in_page - fault-in guest page if necessary + * @vcpu: The corresponding virtual cpu + * @gpa: Guest physical address + * @writable: Whether the page should be writable or not + * + * Make sure that a guest page has been faulted-in on the host. + * + * Return: Zero on success, negative error code otherwise. + */ +long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable) { - long rc; - hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); struct mm_struct *mm = current->mm; + hva_t hva; + long rc; + + hva = gmap_fault(gpa, vcpu->arch.gmap); + if (IS_ERR_VALUE(hva)) + return (long)hva; down_read(&mm->mmap_sem); - rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL); + rc = get_user_pages(current, mm, hva, 1, writable, 0, NULL, NULL); up_read(&mm->mmap_sem); - return rc; + + return rc < 0 ? rc : 0; } static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, @@ -1163,9 +1206,12 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) } else if (current->thread.gmap_pfault) { trace_kvm_s390_major_guest_pfault(vcpu); current->thread.gmap_pfault = 0; - if (kvm_arch_setup_async_pf(vcpu) || - (kvm_arch_fault_in_sync(vcpu) >= 0)) + if (kvm_arch_setup_async_pf(vcpu)) { rc = 0; + } else { + gpa_t gpa = current->thread.gmap_addr; + rc = kvm_arch_fault_in_page(vcpu, gpa, 1); + } } if (rc == -1) { @@ -1235,7 +1281,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_start(vcpu); switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: @@ -1292,7 +1338,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; - kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix; + kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); if (vcpu->sigset_active) @@ -1311,6 +1357,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) { unsigned char archmode = 1; + unsigned int px; u64 clkcomp; int rc; @@ -1329,8 +1376,9 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) vcpu->run->s.regs.gprs, 128); rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, psw), &vcpu->arch.sie_block->gpsw, 16); + px = kvm_s390_get_prefix(vcpu); rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, pref_reg), - &vcpu->arch.sie_block->prefix, 4); + &px, 4); rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, fp_ctrl_reg), &vcpu->arch.guest_fpregs.fpc, 4); @@ -1362,6 +1410,109 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return kvm_s390_store_status_unloaded(vcpu, addr); } +static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return atomic_read(&(vcpu)->arch.sie_block->cpuflags) & CPUSTAT_STOPPED; +} + +static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) +{ + kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); + kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu); + exit_sie_sync(vcpu); +} + +static void __disable_ibs_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + __disable_ibs_on_vcpu(vcpu); + } +} + +static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) +{ + kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); + kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu); + exit_sie_sync(vcpu); +} + +void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +{ + int i, online_vcpus, started_vcpus = 0; + + if (!is_vcpu_stopped(vcpu)) + return; + + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); + /* Only one cpu at a time may enter/leave the STOPPED state. */ + spin_lock_bh(&vcpu->kvm->arch.start_stop_lock); + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + + for (i = 0; i < online_vcpus; i++) { + if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + started_vcpus++; + } + + if (started_vcpus == 0) { + /* we're the only active VCPU -> speed it up */ + __enable_ibs_on_vcpu(vcpu); + } else if (started_vcpus == 1) { + /* + * As we are starting a second VCPU, we have to disable + * the IBS facility on all VCPUs to remove potentially + * oustanding ENABLE requests. + */ + __disable_ibs_on_all_vcpus(vcpu->kvm); + } + + atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + /* + * Another VCPU might have used IBS while we were offline. + * Let's play safe and flush the VCPU at startup. + */ + vcpu->arch.sie_block->ihcpu = 0xffff; + spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock); + return; +} + +void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +{ + int i, online_vcpus, started_vcpus = 0; + struct kvm_vcpu *started_vcpu = NULL; + + if (is_vcpu_stopped(vcpu)) + return; + + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); + /* Only one cpu at a time may enter/leave the STOPPED state. */ + spin_lock_bh(&vcpu->kvm->arch.start_stop_lock); + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + + atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + __disable_ibs_on_vcpu(vcpu); + + for (i = 0; i < online_vcpus; i++) { + if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + started_vcpus++; + started_vcpu = vcpu->kvm->vcpus[i]; + } + } + + if (started_vcpus == 1) { + /* + * As we only have one VCPU left, we want to enable the + * IBS facility for that VCPU to speed it up. + */ + __enable_ibs_on_vcpu(started_vcpu); + } + + spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock); + return; +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9b5680d1f6cc..a8655ed31616 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -61,9 +61,15 @@ static inline int kvm_is_ucontrol(struct kvm *kvm) #endif } +#define GUEST_PREFIX_SHIFT 13 +static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT; +} + static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) { - vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; + vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; vcpu->arch.sie_block->ihcpu = 0xffff; kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } @@ -142,6 +148,7 @@ void kvm_s390_reinject_io_int(struct kvm *kvm, int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked); /* implemented in priv.c */ +int is_valid_psw(psw_t *psw); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); @@ -153,10 +160,14 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); +int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ +long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); +void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void s390_vcpu_block(struct kvm_vcpu *vcpu); void s390_vcpu_unblock(struct kvm_vcpu *vcpu); void exit_sie(struct kvm_vcpu *vcpu); @@ -210,6 +221,7 @@ static inline int kvm_s390_inject_prog_cond(struct kvm_vcpu *vcpu, int rc) int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int psw_extint_disabled(struct kvm_vcpu *vcpu); void kvm_s390_destroy_adapters(struct kvm *kvm); +int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 27f9051a78f8..6296159ac883 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -119,8 +119,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) if (operand2 & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - address = vcpu->arch.sie_block->prefix; - address = address & 0x7fffe000u; + address = kvm_s390_get_prefix(vcpu); /* get the value */ rc = write_guest(vcpu, operand2, &address, sizeof(address)); @@ -206,6 +205,9 @@ static int handle_test_block(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, NULL, ®2); addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + if (kvm_s390_check_low_addr_protection(vcpu, addr)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); addr = kvm_s390_real_to_abs(vcpu, addr); if (kvm_is_error_gpa(vcpu->kvm, addr)) @@ -362,7 +364,8 @@ static void handle_new_psw(struct kvm_vcpu *vcpu) #define PSW_ADDR_24 0x0000000000ffffffUL #define PSW_ADDR_31 0x000000007fffffffUL -static int is_valid_psw(psw_t *psw) { +int is_valid_psw(psw_t *psw) +{ if (psw->mask & PSW_MASK_UNASSIGNED) return 0; if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) { @@ -373,6 +376,8 @@ static int is_valid_psw(psw_t *psw) { return 0; if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA) return 0; + if (psw->addr & 1) + return 0; return 1; } @@ -650,6 +655,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_s390_check_low_addr_protection(vcpu, start)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + } + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { case 0x00000000: end = (start + (1UL << 12)) & ~((1UL << 12) - 1); @@ -665,10 +675,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } while (start < end) { - unsigned long useraddr; - - useraddr = gmap_translate(start, vcpu->arch.gmap); - if (IS_ERR((void *)useraddr)) + unsigned long useraddr, abs_addr; + + /* Translate guest address to host address */ + if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0) + abs_addr = kvm_s390_real_to_abs(vcpu, start); + else + abs_addr = start; + useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr)); + if (kvm_is_error_hva(useraddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index c0b99e0f6b63..d0341d2e54b1 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -458,3 +458,38 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) kvm_s390_set_psw_cc(vcpu, rc); return 0; } + +/* + * Handle SIGP partial execution interception. + * + * This interception will occur at the source cpu when a source cpu sends an + * external call to a target cpu and the target cpu has the WAIT bit set in + * its cpuflags. Interception will occurr after the interrupt indicator bits at + * the target cpu have been set. All error cases will lead to instruction + * interception, therefore nothing is to be checked or prepared. + */ +int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) +{ + int r3 = vcpu->arch.sie_block->ipa & 0x000f; + u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; + struct kvm_vcpu *dest_vcpu; + u8 order_code = kvm_s390_get_base_disp_rs(vcpu); + + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + + if (order_code == SIGP_EXTERNAL_CALL) { + dest_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + BUG_ON(dest_vcpu == NULL); + + spin_lock_bh(&dest_vcpu->arch.local_int.lock); + if (waitqueue_active(&dest_vcpu->wq)) + wake_up_interruptible(&dest_vcpu->wq); + dest_vcpu->preempted = true; + spin_unlock_bh(&dest_vcpu->arch.local_int.lock); + + kvm_s390_set_psw_cc(vcpu, SIGP_CC_ORDER_CODE_ACCEPTED); + return 0; + } + + return -EOPNOTSUPP; +} diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 13f30f58a2df..647e9d6a4818 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -68,6 +68,27 @@ TRACE_EVENT(kvm_s390_destroy_vcpu, ); /* + * Trace point for start and stop of vpcus. + */ +TRACE_EVENT(kvm_s390_vcpu_start_stop, + TP_PROTO(unsigned int id, int state), + TP_ARGS(id, state), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(int, state) + ), + + TP_fast_assign( + __entry->id = id; + __entry->state = state; + ), + + TP_printk("%s cpu %d", __entry->state ? "starting" : "stopping", + __entry->id) + ); + +/* * Trace points for injection of interrupts, either per machine or * per vcpu. */ @@ -223,6 +244,28 @@ TRACE_EVENT(kvm_s390_enable_css, __entry->kvm) ); +/* + * Trace point for enabling and disabling interlocking-and-broadcasting + * suppression. + */ +TRACE_EVENT(kvm_s390_enable_disable_ibs, + TP_PROTO(unsigned int id, int state), + TP_ARGS(id, state), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(int, state) + ), + + TP_fast_assign( + __entry->id = id; + __entry->state = state; + ), + + TP_printk("%s ibs on cpu %d", + __entry->state ? "enabling" : "disabling", __entry->id) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index abf6ba52769e..916834d7a73a 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -2,7 +2,7 @@ #define _TRACE_KVM_H #include <linux/tracepoint.h> -#include <asm/sigp.h> +#include <asm/sie.h> #include <asm/debug.h> #include <asm/dis.h> @@ -125,17 +125,6 @@ TRACE_EVENT(kvm_s390_sie_fault, VCPU_TP_PRINTK("%s", "fault in sie instruction") ); -#define sie_intercept_code \ - {0x04, "Instruction"}, \ - {0x08, "Program interruption"}, \ - {0x0C, "Instruction and program interruption"}, \ - {0x10, "External request"}, \ - {0x14, "External interruption"}, \ - {0x18, "I/O request"}, \ - {0x1C, "Wait state"}, \ - {0x20, "Validity"}, \ - {0x28, "Stop request"} - TRACE_EVENT(kvm_s390_sie_exit, TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode), TP_ARGS(VCPU_ARGS_COMMON, icptcode), @@ -165,7 +154,6 @@ TRACE_EVENT(kvm_s390_intercept_instruction, TP_STRUCT__entry( VCPU_FIELD_COMMON __field(__u64, instruction) - __field(char, insn[8]) ), TP_fast_assign( @@ -176,10 +164,8 @@ TRACE_EVENT(kvm_s390_intercept_instruction, VCPU_TP_PRINTK("intercepted instruction %016llx (%s)", __entry->instruction, - insn_to_mnemonic((unsigned char *) - &__entry->instruction, - __entry->insn, sizeof(__entry->insn)) ? - "unknown" : __entry->insn) + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) ); /* @@ -227,18 +213,6 @@ TRACE_EVENT(kvm_s390_intercept_validity, * Trace points for instructions that are of special interest. */ -#define sigp_order_codes \ - {SIGP_SENSE, "sense"}, \ - {SIGP_EXTERNAL_CALL, "external call"}, \ - {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \ - {SIGP_STOP, "stop"}, \ - {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \ - {SIGP_SET_ARCHITECTURE, "set architecture"}, \ - {SIGP_SET_PREFIX, "set prefix"}, \ - {SIGP_STORE_STATUS_AT_ADDRESS, "store status at addr"}, \ - {SIGP_SENSE_RUNNING, "sense running"}, \ - {SIGP_RESTART, "restart"} - TRACE_EVENT(kvm_s390_handle_sigp, TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \ __u32 parameter), @@ -265,12 +239,28 @@ TRACE_EVENT(kvm_s390_handle_sigp, __entry->cpu_addr, __entry->parameter) ); -#define diagnose_codes \ - {0x10, "release pages"}, \ - {0x44, "time slice end"}, \ - {0x308, "ipl functions"}, \ - {0x500, "kvm hypercall"}, \ - {0x501, "kvm breakpoint"} +TRACE_EVENT(kvm_s390_handle_sigp_pei, + TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr), + TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u8, order_code) + __field(__u16, cpu_addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->order_code = order_code; + __entry->cpu_addr = cpu_addr; + ), + + VCPU_TP_PRINTK("handle sigp pei order %02x (%s), cpu address %04x", + __entry->order_code, + __print_symbolic(__entry->order_code, + sigp_order_codes), + __entry->cpu_addr) + ); TRACE_EVENT(kvm_s390_handle_diag, TP_PROTO(VCPU_PROTO_COMMON, __u16 code), diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ea4a31b95990..66ba60c9b77e 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -958,8 +958,10 @@ void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, unsigned long addr, next; pgd_t *pgd; + down_write(&mm->mmap_sem); + if (init_skey && mm_use_skey(mm)) + goto out_up; addr = start; - down_read(&mm->mmap_sem); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -967,7 +969,10 @@ void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, continue; next = page_table_reset_pud(mm, pgd, addr, next, init_skey); } while (pgd++, addr = next, addr != end); - up_read(&mm->mmap_sem); + if (init_skey) + current->mm->context.use_skey = 1; +out_up: + up_write(&mm->mmap_sem); } EXPORT_SYMBOL(page_table_reset_pgste); @@ -1384,19 +1389,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); */ void s390_enable_skey(void) { - /* - * To avoid races between multiple vcpus, ending in calling - * page_table_reset twice or more, - * the page_table_lock is taken for serialization. - */ - spin_lock(¤t->mm->page_table_lock); - if (mm_use_skey(current->mm)) { - spin_unlock(¤t->mm->page_table_lock); - return; - } - - current->mm->context.use_skey = 1; - spin_unlock(¤t->mm->page_table_lock); page_table_reset_pgste(current->mm, 0, TASK_SIZE, true); } EXPORT_SYMBOL_GPL(s390_enable_skey); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 24ec1216596e..a04fe4eb237d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -189,7 +189,6 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); - void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e21aee98a5c2..49314155b66c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -130,7 +130,6 @@ enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, - VCPU_EXREG_CPL, VCPU_EXREG_SEGMENTS, }; diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 58d66fe06b61..8ba18842c48e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,6 +74,11 @@ dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); #ifdef CONFIG_TRACING dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); +#else +static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) +{ + do_page_fault(regs, error); +} #endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0331cb389d68..7e97371387fd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + trace_do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f47a104a749c..38a0afe83c6b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = + /* NOTE: MONITOR (and MWAIT) are emulated as NOP, + * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | @@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000007: /* Advanced power management */ + /* invariant TSC is CPUID.80000007H:EDX[8] */ + entry->edx &= (1 << 8); + /* mask against host */ + entry->edx &= boot_cpu_data.x86_power; + entry->eax = entry->ebx = entry->ecx = 0; + break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); @@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 6: /* Thermal management */ - case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: @@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } +EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); /* * If no match is found, check whether we exceed the vCPU's limit diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index eeecbed26ac7..f9087315e0cd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_X2APIC)); } +static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_GBPAGES)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8a58409b5ac..e4e833d3d7d7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,6 +161,7 @@ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ +#define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm |= (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; - if (ctxt->modrm_mod == 3) { + if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, @@ -1410,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, } /* Does not support long mode */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg) +static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, bool in_task_switch) { struct desc_struct seg_desc, old_desc; - u8 dpl, rpl, cpl; + u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ @@ -1442,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } rpl = selector & 3; - cpl = ctxt->ops->cpl(ctxt); /* NULL selector is not valid for TR, CS and SS (except for long mode) */ if ((seg == VCPU_SREG_CS @@ -1487,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + if (in_task_switch && rpl != dpl) + goto exception; + if (!(seg_desc.type & 8)) goto exception; @@ -1544,6 +1547,13 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg) +{ + u8 cpl = ctxt->ops->cpl(ctxt); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false); +} + static void write_register_operand(struct operand *op) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -2405,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { int ret; + u8 cpl; ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; @@ -2427,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + cpl = tss->cs & 3; + /* * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2521,6 +2534,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { int ret; + u8 cpl; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); @@ -2539,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, /* * SDM says that segment selectors are loaded before segment - * descriptors + * descriptors. This is important because CPL checks will + * use CS.RPL. */ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); @@ -2553,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * If we're switching between Protected Mode and VM86, we need to make * sure to update the mode before loading the segment descriptors so * that the selectors are interpreted correctly. - * - * Need to get rflags to the vcpu struct immediately because it - * influences the CPL which is checked at least when loading the segment - * descriptors and when pushing an error code to the new kernel stack. - * - * TODO Introduce a separate ctxt->ops->set_cpl callback */ - if (ctxt->eflags & X86_EFLAGS_VM) + if (ctxt->eflags & X86_EFLAGS_VM) { ctxt->mode = X86EMUL_MODE_VM86; - else + cpl = 3; + } else { ctxt->mode = X86EMUL_MODE_PROT32; - - ctxt->ops->set_rflags(ctxt, ctxt->eflags); + cpl = tss->cs & 3; + } /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -3868,10 +3878,12 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, + check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, + check_dr_write), N, N, N, N, GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9736529ade08..006911858174 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { + /* Note that we never get here with APIC virtualization enabled. */ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) apic->highest_isr_cache = vec; } +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + /* + * Note that isr_count is always 1, and highest_isr_cache + * is always -1, with APIC virtualization enabled. + */ + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + struct kvm_vcpu *vcpu; + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; + + /* + * We do get here for APIC virtualization enabled if the guest + * uses the Hyper-V APIC enlightenment. In this case we may need + * to trigger a new interrupt delivery by writing the SVI field; + * on the other hand isr_count and highest_isr_cache are unused + * and must be left alone. + */ + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); + else { --apic->isr_count; - BUG_ON(apic->isr_count < 0); - apic->highest_isr_cache = -1; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; + } } int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) @@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - /* Note that isr_count is always 1 with vid enabled */ - if (!apic->isr_count) - return -1; - if (likely(apic->highest_isr_cache != -1)) - return apic->highest_isr_cache; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + /* Note that we never get here with APIC virtualization enabled. */ + if (vector == -1) return -1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 65f2400b8268..931467881da7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,6 +22,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "cpuid.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -3516,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + u64 gbpages_bit_rsvd = 0; context->bad_mt_xwr = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); + if (!guest_cpuid_has_gbpages(vcpu)) + gbpages_bit_rsvd = rsvd_bits(7, 7); switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3557,14 +3561,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->rsvd_bits_mask[0][3] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7f4f9c2badae..ec8366c5cfea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } -static void svm_update_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int cpl; - - if (!is_protmode(vcpu)) - cpl = 0; - else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) - cpl = 3; - else - cpl = svm->vmcb->save.cs.selector & 0x3; - - svm->vmcb->save.cpl = cpl; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; - + /* + * Any change of EFLAGS.VM is accompained by a reload of SS + * (caused by either a task switch or an inter-privilege IRET), + * so we do not need to update the CPL here. + */ to_svm(vcpu)->vmcb->save.rflags = rflags; - if ((old_rflags ^ rflags) & X86_EFLAGS_VM) - svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) @@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } - if (seg == VCPU_SREG_CS) - svm_update_cpl(vcpu); + + /* + * This is always accurate, except if SYSRET returned to a segment + * with SS.DPL != 3. Intel does not have this quirk, and always + * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it + * would entail passing the CPL to userspace and back. + */ + if (seg == VCPU_SREG_SS) + svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; mark_dirty(svm->vmcb, VMCB_SEG); } @@ -2770,12 +2763,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -3287,6 +3274,24 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } +static int nop_interception(struct vcpu_svm *svm) +{ + skip_emulated_instruction(&(svm->vcpu)); + return 1; +} + +static int monitor_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return nop_interception(svm); +} + +static int mwait_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return nop_interception(svm); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3344,8 +3349,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = monitor_interception, + [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 545245d7cc63..33574c95220d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall, /* * Tracepoint for PIO. */ + +#define KVM_PIO_IN 0 +#define KVM_PIO_OUT 1 + TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count), - TP_ARGS(rw, port, size, count), + unsigned int count, void *data), + TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, port ) __field( unsigned int, size ) __field( unsigned int, count ) + __field( unsigned int, val ) ), TP_fast_assign( @@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio, __entry->port = port; __entry->size = size; __entry->count = count; + if (size == 1) + __entry->val = *(unsigned char *)data; + else if (size == 2) + __entry->val = *(unsigned short *)data; + else + __entry->val = *(unsigned int *)data; ), - TP_printk("pio_%s at 0x%x size %d count %d", + TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", __entry->rw ? "write" : "read", - __entry->port, __entry->size, __entry->count) + __entry->port, __entry->size, __entry->count, __entry->val, + __entry->count > 1 ? "(...)" : "") ); /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72b8012991b9..248287cefa7a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -354,6 +354,7 @@ struct vmcs02_list { struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + gpa_t vmxon_ptr; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -413,7 +414,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - u8 cpl; bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; @@ -3149,10 +3149,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - /* CPL is always 0 when CPU enters protected mode */ - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = 0; } static void fix_rmode_seg(int seg, struct kvm_segment *save) @@ -3554,22 +3550,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_protmode(vcpu)) + if (unlikely(vmx->rmode.vm86_active)) return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return AR_DPL(ar); } - - return vmx->cpl; } - static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -3597,8 +3585,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); - if (seg == VCPU_SREG_CS) - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; @@ -5142,7 +5128,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); @@ -5415,7 +5401,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); /* * TODO: What about debug traps on tss switch? @@ -5649,12 +5635,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + skip_emulated_instruction(vcpu); return 1; } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + /* * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. * We could reuse a single VMCS for all the L2 guests, but we also want the @@ -5792,6 +5790,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) } /* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)<<scaling; + *ret += exit_qualification; /* holds the displacement */ + + if (addr_size == 1) /* 32 bit */ + *ret &= 0xffffffff; + + /* + * TODO: throw #GP (and return 1) in various cases that the VM* + * instructions require it - e.g., offset beyond segment limit, + * unusable or unreadable/unwritable segment, non-canonical 64-bit + * address, and so on. Currently these are not checked. + */ + return 0; +} + +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, + gpa_t *vmpointer) +{ + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_VMON: + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 + * for the nested case; + * which replaces physical address width with 32 + * + */ + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL || + *(u32 *)kmap(page) != VMCS12_REVISION) { + nested_vmx_failInvalid(vcpu); + kunmap(page); + skip_emulated_instruction(vcpu); + return 1; + } + kunmap(page); + vmx->nested.vmxon_ptr = vmptr; + break; + case EXIT_REASON_VMCLEAR: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case EXIT_REASON_VMPTRLD: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + default: + return 1; /* shouldn't happen */ + } + + if (vmpointer) + *vmpointer = vmptr; + return 0; +} + +/* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even * inspect the argument to VMXON (the so-called "VMXON pointer") because we @@ -5829,6 +5975,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kvm_inject_gp(vcpu, 0); return 1; } + + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); skip_emulated_instruction(vcpu); @@ -5951,88 +6101,20 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; } -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) -{ - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ - return 0; -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; struct vmcs12 *vmcs12; struct page *page; - struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - if (vmptr == vmx->nested.current_vmptr) { nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; @@ -6352,30 +6434,15 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; - struct x86_exception e; u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; @@ -6547,8 +6614,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, }; @@ -7389,7 +7456,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c5582c385bc0..57eac309c22e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -701,10 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS)) - return 1; - if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; + } else if (is_pae(vcpu) && is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; vcpu->arch.cr3 = cr3; @@ -1917,6 +1918,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { vcpu->arch.hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; break; } gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; @@ -1927,6 +1930,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; vcpu->arch.hv_vapic = data; mark_page_dirty(vcpu->kvm, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; break; } case HV_X64_MSR_EOI: @@ -4480,8 +4485,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count, bool in) { - trace_kvm_pio(!in, port, size, count); - vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; @@ -4516,6 +4519,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -4530,6 +4534,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } @@ -4641,11 +4646,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } -static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) -{ - kvm_set_rflags(emul_to_vcpu(ctxt), val); -} - static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4830,7 +4830,6 @@ static const struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, - .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index b57fe0efb422..1918d9dff45d 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -27,7 +27,9 @@ struct read_info_sccb { u8 loadparm[8]; /* 24-31 */ u8 _reserved1[48 - 32]; /* 32-47 */ u64 facilities; /* 48-55 */ - u8 _reserved2[84 - 56]; /* 56-83 */ + u8 _reserved2a[76 - 56]; /* 56-75 */ + u32 ibc; /* 76-79 */ + u8 _reserved2b[84 - 80]; /* 80-83 */ u8 fac84; /* 84 */ u8 fac85; /* 85 */ u8 _reserved3[91 - 86]; /* 86-90 */ @@ -47,6 +49,7 @@ static unsigned long sclp_hsa_size; static unsigned int sclp_max_cpu; static struct sclp_ipl_info sclp_ipl_info; static unsigned char sclp_siif; +static u32 sclp_ibc; u64 sclp_facilities; u8 sclp_fac84; @@ -111,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp_rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; sclp_rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; sclp_rzm <<= 20; + sclp_ibc = sccb->ibc; if (!sccb->hcpua) { if (MACHINE_IS_VM) @@ -168,6 +172,12 @@ int sclp_has_siif(void) } EXPORT_SYMBOL(sclp_has_siif); +unsigned int sclp_get_ibc(void) +{ + return sclp_ibc; +} +EXPORT_SYMBOL(sclp_get_ibc); + /* * This function will be called after sclp_facilities_detect(), which gets * called from early.c code. The sclp_facilities_detect() function retrieves diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 820fc2e1d9df..970c68197c69 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -134,6 +134,8 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_EPR_EXIT 20 #define KVM_REQ_SCAN_IOAPIC 21 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 22 +#define KVM_REQ_ENABLE_IBS 23 +#define KVM_REQ_DISABLE_IBS 24 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -368,6 +370,7 @@ struct kvm { struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots; struct srcu_struct srcu; + struct srcu_struct irq_srcu; #ifdef CONFIG_KVM_APIC_ARCHITECTURE u32 bsp_vcpu_id; #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 16cb1a14993b..32cf446f1911 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -424,6 +424,8 @@ struct kvm_s390_psw { #define KVM_S390_INT_PFAULT_INIT 0xfffe0004u #define KVM_S390_INT_PFAULT_DONE 0xfffe0005u #define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u #define KVM_S390_INT_VIRTIO 0xffff2603u #define KVM_S390_INT_SERVICE 0xffff2401u #define KVM_S390_INT_EMERGENCY 0xffff1201u diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 10df100c4514..62f4223f4c9e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,12 +80,10 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - use_mm(mm); down_read(&mm->mmap_sem); - get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); + get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); kvm_async_page_present_sync(vcpu, apf); - unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); list_add_tail(&apf->link, &vcpu->async_pf.done); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 912ec5a95e2c..20c3af7692c5 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -31,6 +31,7 @@ #include <linux/list.h> #include <linux/eventfd.h> #include <linux/kernel.h> +#include <linux/srcu.h> #include <linux/slab.h> #include "iodev.h" @@ -118,19 +119,22 @@ static void irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) { struct _irqfd_resampler *resampler; + struct kvm *kvm; struct _irqfd *irqfd; + int idx; resampler = container_of(kian, struct _irqfd_resampler, notifier); + kvm = resampler->kvm; - kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); - rcu_read_lock(); + idx = srcu_read_lock(&kvm->irq_srcu); list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) eventfd_signal(irqfd->resamplefd, 1); - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); } static void @@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd) mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); - synchronize_rcu(); + synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { list_del(&resampler->link); @@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) unsigned long flags = (unsigned long)key; struct kvm_kernel_irq_routing_entry *irq; struct kvm *kvm = irqfd->kvm; + int idx; if (flags & POLLIN) { - rcu_read_lock(); - irq = rcu_dereference(irqfd->irq_entry); + idx = srcu_read_lock(&kvm->irq_srcu); + irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu); /* An event has been signaled, inject an interrupt */ if (irq) kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); else schedule_work(&irqfd->inject); - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); } if (flags & POLLHUP) { @@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_rcu(); + synchronize_srcu(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } @@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) * another thread calls kvm_irq_routing_update before * we flush workqueue below (we synchronize with * kvm_irq_routing_update using irqfds.lock). - * It is paired with synchronize_rcu done by caller + * It is paired with synchronize_srcu done by caller * of that function. */ rcu_assign_pointer(irqfd->irq_entry, NULL); @@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm) /* * Change irq_routing and irqfd. - * Caller must invoke synchronize_rcu afterwards. + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. */ void kvm_irq_routing_update(struct kvm *kvm, struct kvm_irq_routing_table *irq_rt) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index e2e6b4473a96..ced4a542a031 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) struct kvm_kernel_irq_routing_entry *e; int ret = -EINVAL; struct kvm_irq_routing_table *irq_rt; + int idx; trace_kvm_set_irq(irq, level, irq_source_id); @@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) * Since there's no easy way to do this, we only support injecting MSI * which is limited to 1:1 GSI mapping. */ - rcu_read_lock(); - irq_rt = rcu_dereference(kvm->irq_routing); + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); if (irq < irq_rt->nr_rt_entries) hlist_for_each_entry(e, &irq_rt->map[irq], link) { if (likely(e->type == KVM_IRQ_ROUTING_MSI)) @@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) ret = -EWOULDBLOCK; break; } - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); return ret; } @@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, mutex_lock(&kvm->irq_lock); hlist_del_rcu(&kimn->link); mutex_unlock(&kvm->irq_lock); - synchronize_rcu(); + synchronize_srcu(&kvm->irq_srcu); } void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask) { struct kvm_irq_mask_notifier *kimn; - int gsi; + int idx, gsi; - rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; if (gsi != -1) hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link) if (kimn->irq == gsi) kimn->func(kimn, mask); - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); } int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 20dc9e4a8f6c..b43c275775cd 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -26,6 +26,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> +#include <linux/srcu.h> #include <linux/export.h> #include <trace/events/kvm.h> #include "irq.h" @@ -33,19 +34,19 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; - int gsi; + int gsi, idx; - rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; if (gsi != -1) hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, link) if (kian->gsi == gsi) { - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); return true; } - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); return false; } @@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; - int gsi; + int gsi, idx; trace_kvm_ack_irq(irqchip, pin); - rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; if (gsi != -1) hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, link) if (kian->gsi == gsi) kian->irq_acked(kian); - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); } void kvm_register_irq_ack_notifier(struct kvm *kvm, @@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); - synchronize_rcu(); + synchronize_srcu(&kvm->irq_srcu); #ifdef __KVM_HAVE_IOAPIC kvm_vcpu_request_scan_ioapic(kvm); #endif @@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; - int ret = -1, i = 0; + int ret = -1, i = 0, idx; struct kvm_irq_routing_table *irq_rt; trace_kvm_set_irq(irq, level, irq_source_id); @@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - rcu_read_lock(); - irq_rt = rcu_dereference(kvm->irq_routing); + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); if (irq < irq_rt->nr_rt_entries) hlist_for_each_entry(e, &irq_rt->map[irq], link) irq_set[i++] = *e; - rcu_read_unlock(); + srcu_read_unlock(&kvm->irq_srcu, idx); while(i--) { int r; @@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm, new); mutex_unlock(&kvm->irq_lock); - synchronize_rcu(); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; r = 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e642b4..95b4c2b3906a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -457,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type) r = kvm_arch_init_vm(kvm, type); if (r) - goto out_err_nodisable; + goto out_err_no_disable; r = hardware_enable_all(); if (r) - goto out_err_nodisable; + goto out_err_no_disable; #ifdef CONFIG_HAVE_KVM_IRQCHIP INIT_HLIST_HEAD(&kvm->mask_notifier_list); @@ -473,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type) r = -ENOMEM; kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) - goto out_err_nosrcu; + goto out_err_no_srcu; kvm_init_memslots_id(kvm); if (init_srcu_struct(&kvm->srcu)) - goto out_err_nosrcu; + goto out_err_no_srcu; + if (init_srcu_struct(&kvm->irq_srcu)) + goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); @@ -505,10 +507,12 @@ static struct kvm *kvm_create_vm(unsigned long type) return kvm; out_err: + cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); -out_err_nosrcu: +out_err_no_srcu: hardware_disable_all(); -out_err_nodisable: +out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); |