Diffstat (limited to 'lib')
-rw-r--r-- | lib/cpus/aarch64/cortex_a75_pubsub.c               |   2
-rw-r--r-- | lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S | 223
-rw-r--r-- | lib/cpus/aarch64/workaround_cve_2017_5715_mmu.S    |  62
-rw-r--r-- | lib/extensions/amu/aarch32/amu.c                   |  93
-rw-r--r-- | lib/extensions/amu/aarch32/amu_helpers.S           | 265
-rw-r--r-- | lib/extensions/amu/aarch64/amu.c                   |   1
-rw-r--r-- | lib/psci/psci_main.c                               |   6
7 files changed, 497 insertions, 155 deletions
diff --git a/lib/cpus/aarch64/cortex_a75_pubsub.c b/lib/cpus/aarch64/cortex_a75_pubsub.c index c1089a607..a1ffcb041 100644 --- a/lib/cpus/aarch64/cortex_a75_pubsub.c +++ b/lib/cpus/aarch64/cortex_a75_pubsub.c @@ -5,8 +5,8 @@ */ #include <cortex_a75.h> -#include <pubsub_events.h> #include <platform.h> +#include <pubsub_events.h> struct amu_ctx { uint64_t cnts[CORTEX_A75_AMU_NR_COUNTERS]; diff --git a/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S b/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S index cd29266ed..cd8249732 100644 --- a/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S +++ b/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S @@ -1,20 +1,27 @@ /* - * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2017-2018, ARM Limited and Contributors. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ #include <arch.h> +#include <arm_arch_svc.h> #include <asm_macros.S> #include <context.h> .globl workaround_bpiall_vbar0_runtime_exceptions #define EMIT_BPIALL 0xee070fd5 -#define EMIT_MOV_R0_IMM(v) 0xe3a0000##v #define EMIT_SMC 0xe1600070 +#define ESR_EL3_A64_SMC0 0x5e000000 + + .macro enter_workaround _from_vector + /* + * Save register state to enable a call to AArch32 S-EL1 and return + * Identify the original calling vector in w2 (==_from_vector) + * Use w3-w6 for additional register state preservation while in S-EL1 + */ - .macro enter_workaround _stub_name /* Save GP regs */ stp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] stp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] @@ -32,47 +39,50 @@ stp x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26] stp x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28] - adr x4, \_stub_name + /* Identify the original exception vector */ + mov w2, \_from_vector + + /* Preserve 32-bit system registers in GP registers through the workaround */ + mrs x3, esr_el3 + mrs x4, spsr_el3 + mrs x5, scr_el3 + mrs x6, sctlr_el1 /* - * Load SPSR_EL3 and VBAR_EL3. SPSR_EL3 is set up to have - * all interrupts masked in preparation to running the workaround - * stub in S-EL1. VBAR_EL3 points to the vector table that - * will handle the SMC back from the workaround stub. + * Preserve LR and ELR_EL3 registers in the GP regs context. + * Temporarily use the CTX_GPREG_SP_EL0 slot to preserve ELR_EL3 + * through the workaround. This is OK because at this point the + * current state for this context's SP_EL0 is in the live system + * register, which is unmodified by the workaround. */ - ldp x0, x1, [x4, #0] + mrs x7, elr_el3 + stp x30, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR] /* - * Load SCTLR_EL1 and ELR_EL3. SCTLR_EL1 is configured to disable - * the MMU in S-EL1. ELR_EL3 points to the appropriate stub in S-EL1. + * Load system registers for entry to S-EL1. */ - ldp x2, x3, [x4, #16] - mrs x4, scr_el3 - mrs x5, spsr_el3 - mrs x6, elr_el3 - mrs x7, sctlr_el1 - mrs x8, esr_el3 + /* Mask all interrupts and set AArch32 Supervisor mode */ + movz w8, SPSR_MODE32(MODE32_svc, SPSR_T_ARM, SPSR_E_LITTLE, SPSR_AIF_MASK) + + /* Switch EL3 exception vectors while the workaround is executing. 
*/ + adr x9, workaround_bpiall_vbar1_runtime_exceptions + + /* Setup SCTLR_EL1 with MMU off and I$ on */ + ldr x10, stub_sel1_sctlr - /* Preserve system registers in the workaround context */ - stp x4, x5, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD0] - stp x6, x7, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD2] - stp x8, x30, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD4] + /* Land at the S-EL1 workaround stub */ + adr x11, aarch32_stub /* * Setting SCR_EL3 to all zeroes means that the NS, RW * and SMD bits are configured as expected. */ msr scr_el3, xzr - - /* - * Reload system registers with the crafted values - * in preparation for entry in S-EL1. - */ - msr spsr_el3, x0 - msr vbar_el3, x1 - msr sctlr_el1, x2 - msr elr_el3, x3 + msr spsr_el3, x8 + msr vbar_el3, x9 + msr sctlr_el1, x10 + msr elr_el3, x11 eret .endm @@ -91,76 +101,31 @@ vector_base workaround_bpiall_vbar0_runtime_exceptions */ vector_entry workaround_bpiall_vbar0_sync_exception_sp_el0 b sync_exception_sp_el0 + nop /* to force 8 byte alignment for the following stub */ + /* * Since each vector table entry is 128 bytes, we can store the * stub context in the unused space to minimize memory footprint. */ -aarch32_stub_smc: +stub_sel1_sctlr: + .quad SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT + +aarch32_stub: .word EMIT_BPIALL - .word EMIT_MOV_R0_IMM(1) .word EMIT_SMC -aarch32_stub_ctx_smc: - /* Mask all interrupts and set AArch32 Supervisor mode */ - .quad (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \ - SPSR_M_AARCH32 << SPSR_M_SHIFT | \ - MODE32_svc << MODE32_SHIFT) - /* - * VBAR_EL3 points to vbar1 which is the vector table - * used while the workaround is executing. - */ - .quad workaround_bpiall_vbar1_runtime_exceptions - - /* Setup SCTLR_EL1 with MMU off and I$ on */ - .quad SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT - - /* ELR_EL3 is setup to point to the sync exception stub in AArch32 */ - .quad aarch32_stub_smc check_vector_size workaround_bpiall_vbar0_sync_exception_sp_el0 vector_entry workaround_bpiall_vbar0_irq_sp_el0 b irq_sp_el0 -aarch32_stub_irq: - .word EMIT_BPIALL - .word EMIT_MOV_R0_IMM(2) - .word EMIT_SMC -aarch32_stub_ctx_irq: - .quad (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \ - SPSR_M_AARCH32 << SPSR_M_SHIFT | \ - MODE32_svc << MODE32_SHIFT) - .quad workaround_bpiall_vbar1_runtime_exceptions - .quad SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT - .quad aarch32_stub_irq check_vector_size workaround_bpiall_vbar0_irq_sp_el0 vector_entry workaround_bpiall_vbar0_fiq_sp_el0 b fiq_sp_el0 -aarch32_stub_fiq: - .word EMIT_BPIALL - .word EMIT_MOV_R0_IMM(4) - .word EMIT_SMC -aarch32_stub_ctx_fiq: - .quad (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \ - SPSR_M_AARCH32 << SPSR_M_SHIFT | \ - MODE32_svc << MODE32_SHIFT) - .quad workaround_bpiall_vbar1_runtime_exceptions - .quad SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT - .quad aarch32_stub_fiq check_vector_size workaround_bpiall_vbar0_fiq_sp_el0 vector_entry workaround_bpiall_vbar0_serror_sp_el0 b serror_sp_el0 -aarch32_stub_serror: - .word EMIT_BPIALL - .word EMIT_MOV_R0_IMM(8) - .word EMIT_SMC -aarch32_stub_ctx_serror: - .quad (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \ - SPSR_M_AARCH32 << SPSR_M_SHIFT | \ - MODE32_svc << MODE32_SHIFT) - .quad workaround_bpiall_vbar1_runtime_exceptions - .quad SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT - .quad aarch32_stub_serror check_vector_size workaround_bpiall_vbar0_serror_sp_el0 /* --------------------------------------------------------------------- @@ -188,19 +153,19 @@ vector_entry workaround_bpiall_vbar0_serror_sp_elx * 
--------------------------------------------------------------------- */ vector_entry workaround_bpiall_vbar0_sync_exception_aarch64 - enter_workaround aarch32_stub_ctx_smc + enter_workaround 1 check_vector_size workaround_bpiall_vbar0_sync_exception_aarch64 vector_entry workaround_bpiall_vbar0_irq_aarch64 - enter_workaround aarch32_stub_ctx_irq + enter_workaround 2 check_vector_size workaround_bpiall_vbar0_irq_aarch64 vector_entry workaround_bpiall_vbar0_fiq_aarch64 - enter_workaround aarch32_stub_ctx_fiq + enter_workaround 4 check_vector_size workaround_bpiall_vbar0_fiq_aarch64 vector_entry workaround_bpiall_vbar0_serror_aarch64 - enter_workaround aarch32_stub_ctx_serror + enter_workaround 8 check_vector_size workaround_bpiall_vbar0_serror_aarch64 /* --------------------------------------------------------------------- @@ -208,19 +173,19 @@ vector_entry workaround_bpiall_vbar0_serror_aarch64 * --------------------------------------------------------------------- */ vector_entry workaround_bpiall_vbar0_sync_exception_aarch32 - enter_workaround aarch32_stub_ctx_smc + enter_workaround 1 check_vector_size workaround_bpiall_vbar0_sync_exception_aarch32 vector_entry workaround_bpiall_vbar0_irq_aarch32 - enter_workaround aarch32_stub_ctx_irq + enter_workaround 2 check_vector_size workaround_bpiall_vbar0_irq_aarch32 vector_entry workaround_bpiall_vbar0_fiq_aarch32 - enter_workaround aarch32_stub_ctx_fiq + enter_workaround 4 check_vector_size workaround_bpiall_vbar0_fiq_aarch32 vector_entry workaround_bpiall_vbar0_serror_aarch32 - enter_workaround aarch32_stub_ctx_serror + enter_workaround 8 check_vector_size workaround_bpiall_vbar0_serror_aarch32 /* --------------------------------------------------------------------- @@ -297,31 +262,33 @@ vector_entry workaround_bpiall_vbar1_serror_aarch64 * --------------------------------------------------------------------- */ vector_entry workaround_bpiall_vbar1_sync_exception_aarch32 - /* Restore register state from the workaround context */ - ldp x2, x3, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD0] - ldp x4, x5, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD2] - ldp x6, x30, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD4] + /* + * w2 indicates which SEL1 stub was run and thus which original vector was used + * w3-w6 contain saved system register state (esr_el3 in w3) + * Restore LR and ELR_EL3 register state from the GP regs context + */ + ldp x30, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR] /* Apply the restored system register state */ - msr scr_el3, x2 - msr spsr_el3, x3 - msr elr_el3, x4 - msr sctlr_el1, x5 - msr esr_el3, x6 + msr esr_el3, x3 + msr spsr_el3, x4 + msr scr_el3, x5 + msr sctlr_el1, x6 + msr elr_el3, x7 /* * Workaround is complete, so swap VBAR_EL3 to point * to workaround entry table in preparation for subsequent * Sync/IRQ/FIQ/SError exceptions. */ - adr x2, workaround_bpiall_vbar0_runtime_exceptions - msr vbar_el3, x2 + adr x0, workaround_bpiall_vbar0_runtime_exceptions + msr vbar_el3, x0 /* - * Restore all GP regs except x0 and x1. The value in x0 + * Restore all GP regs except x2 and x3 (esr). The value in x2 * indicates the type of the original exception. 
*/ - ldp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] + ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] ldp x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4] ldp x6, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X6] ldp x8, x9, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X8] @@ -336,37 +303,55 @@ vector_entry workaround_bpiall_vbar1_sync_exception_aarch32 ldp x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26] ldp x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28] + /* Fast path Sync exceptions. Static predictor will fall through. */ + tbz w2, #0, workaround_not_sync + /* - * Each of these handlers will first restore x0 and x1 from - * the context and the branch to the common implementation for - * each of the exception types. + * Check if SMC is coming from A64 state on #0 + * with W0 = SMCCC_ARCH_WORKAROUND_1 + * + * This sequence evaluates as: + * (W0==SMCCC_ARCH_WORKAROUND_1) ? (ESR_EL3==SMC#0) : (NE) + * allowing use of a single branch operation */ - tbnz x0, #1, workaround_bpiall_vbar1_irq - tbnz x0, #2, workaround_bpiall_vbar1_fiq - tbnz x0, #3, workaround_bpiall_vbar1_serror - - /* Fallthrough case for Sync exception */ - ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] + orr w2, wzr, #SMCCC_ARCH_WORKAROUND_1 + cmp w0, w2 + mov_imm w2, ESR_EL3_A64_SMC0 + ccmp w3, w2, #0, eq + /* Static predictor will predict a fall through */ + bne 1f + eret +1: + ldp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] b sync_exception_aarch64 check_vector_size workaround_bpiall_vbar1_sync_exception_aarch32 vector_entry workaround_bpiall_vbar1_irq_aarch32 b report_unhandled_interrupt -workaround_bpiall_vbar1_irq: - ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] + + /* + * Post-workaround fan-out for non-sync exceptions + */ +workaround_not_sync: + tbnz w2, #3, workaround_bpiall_vbar1_serror + tbnz w2, #2, workaround_bpiall_vbar1_fiq + /* IRQ */ + ldp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] b irq_aarch64 + +workaround_bpiall_vbar1_fiq: + ldp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] + b fiq_aarch64 + +workaround_bpiall_vbar1_serror: + ldp x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2] + b serror_aarch64 check_vector_size workaround_bpiall_vbar1_irq_aarch32 vector_entry workaround_bpiall_vbar1_fiq_aarch32 b report_unhandled_interrupt -workaround_bpiall_vbar1_fiq: - ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] - b fiq_aarch64 check_vector_size workaround_bpiall_vbar1_fiq_aarch32 vector_entry workaround_bpiall_vbar1_serror_aarch32 b report_unhandled_exception -workaround_bpiall_vbar1_serror: - ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] - b serror_aarch64 check_vector_size workaround_bpiall_vbar1_serror_aarch32 diff --git a/lib/cpus/aarch64/workaround_cve_2017_5715_mmu.S b/lib/cpus/aarch64/workaround_cve_2017_5715_mmu.S index f4781484c..b24b620c8 100644 --- a/lib/cpus/aarch64/workaround_cve_2017_5715_mmu.S +++ b/lib/cpus/aarch64/workaround_cve_2017_5715_mmu.S @@ -1,26 +1,60 @@ /* - * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2017-2018, ARM Limited and Contributors. All rights reserved. 
* * SPDX-License-Identifier: BSD-3-Clause */ #include <arch.h> +#include <arm_arch_svc.h> #include <asm_macros.S> #include <context.h> .globl workaround_mmu_runtime_exceptions +#define ESR_EL3_A64_SMC0 0x5e000000 + vector_base workaround_mmu_runtime_exceptions - .macro apply_workaround + .macro apply_workaround _is_sync_exception stp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] - mrs x0, sctlr_el3 + mrs x1, sctlr_el3 /* Disable MMU */ - bic x1, x0, #SCTLR_M_BIT + bic x1, x1, #SCTLR_M_BIT msr sctlr_el3, x1 isb - /* Restore MMU config */ - msr sctlr_el3, x0 + /* Enable MMU */ + orr x1, x1, #SCTLR_M_BIT + msr sctlr_el3, x1 + /* + * Defer ISB to avoid synchronizing twice in case we hit + * the workaround SMC call which will implicitly synchronize + * because of the ERET instruction. + */ + + /* + * Ensure SMC is coming from A64 state on #0 + * with W0 = SMCCC_ARCH_WORKAROUND_1 + * + * This sequence evaluates as: + * (W0==SMCCC_ARCH_WORKAROUND_1) ? (ESR_EL3==SMC#0) : (NE) + * allowing use of a single branch operation + */ + .if \_is_sync_exception + orr w1, wzr, #SMCCC_ARCH_WORKAROUND_1 + cmp w0, w1 + mrs x0, esr_el3 + mov_imm w1, ESR_EL3_A64_SMC0 + ccmp w0, w1, #0, eq + /* Static predictor will predict a fall through */ + bne 1f + eret +1: + .endif + + /* + * Synchronize now to enable the MMU. This is required + * to ensure the load pair below reads the data stored earlier. + */ isb ldp x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0] .endm @@ -70,22 +104,22 @@ vector_entry workaround_mmu_serror_sp_elx * --------------------------------------------------------------------- */ vector_entry workaround_mmu_sync_exception_aarch64 - apply_workaround + apply_workaround _is_sync_exception=1 b sync_exception_aarch64 check_vector_size workaround_mmu_sync_exception_aarch64 vector_entry workaround_mmu_irq_aarch64 - apply_workaround + apply_workaround _is_sync_exception=0 b irq_aarch64 check_vector_size workaround_mmu_irq_aarch64 vector_entry workaround_mmu_fiq_aarch64 - apply_workaround + apply_workaround _is_sync_exception=0 b fiq_aarch64 check_vector_size workaround_mmu_fiq_aarch64 vector_entry workaround_mmu_serror_aarch64 - apply_workaround + apply_workaround _is_sync_exception=0 b serror_aarch64 check_vector_size workaround_mmu_serror_aarch64 @@ -94,21 +128,21 @@ vector_entry workaround_mmu_serror_aarch64 * --------------------------------------------------------------------- */ vector_entry workaround_mmu_sync_exception_aarch32 - apply_workaround + apply_workaround _is_sync_exception=1 b sync_exception_aarch32 check_vector_size workaround_mmu_sync_exception_aarch32 vector_entry workaround_mmu_irq_aarch32 - apply_workaround + apply_workaround _is_sync_exception=0 b irq_aarch32 check_vector_size workaround_mmu_irq_aarch32 vector_entry workaround_mmu_fiq_aarch32 - apply_workaround + apply_workaround _is_sync_exception=0 b fiq_aarch32 check_vector_size workaround_mmu_fiq_aarch32 vector_entry workaround_mmu_serror_aarch32 - apply_workaround + apply_workaround _is_sync_exception=0 b serror_aarch32 check_vector_size workaround_mmu_serror_aarch32 diff --git a/lib/extensions/amu/aarch32/amu.c b/lib/extensions/amu/aarch32/amu.c index effc5bd3a..68cc4b34c 100644 --- a/lib/extensions/amu/aarch32/amu.c +++ b/lib/extensions/amu/aarch32/amu.c @@ -5,6 +5,7 @@ */ #include <amu.h> +#include <amu_private.h> #include <arch.h> #include <arch_helpers.h> #include <platform.h> @@ -14,21 +15,26 @@ struct amu_ctx { uint64_t group0_cnts[AMU_GROUP0_NR_COUNTERS]; + uint64_t group1_cnts[AMU_GROUP1_NR_COUNTERS]; }; static 
struct amu_ctx amu_ctxs[PLATFORM_CORE_COUNT]; -void amu_enable(int el2_unused) +int amu_supported(void) { uint64_t features; features = read_id_pfr0() >> ID_PFR0_AMU_SHIFT; - if ((features & ID_PFR0_AMU_MASK) != 1) + return (features & ID_PFR0_AMU_MASK) == 1; +} + +void amu_enable(int el2_unused) +{ + if (!amu_supported()) return; if (el2_unused) { uint64_t v; - /* * Non-secure access from EL0 or EL1 to the Activity Monitor * registers do not trap to EL2. @@ -40,15 +46,64 @@ void amu_enable(int el2_unused) /* Enable group 0 counters */ write_amcntenset0(AMU_GROUP0_COUNTERS_MASK); + + /* Enable group 1 counters */ + write_amcntenset1(AMU_GROUP1_COUNTERS_MASK); +} + +/* Read the group 0 counter identified by the given `idx`. */ +uint64_t amu_group0_cnt_read(int idx) +{ + assert(amu_supported()); + assert(idx >= 0 && idx < AMU_GROUP0_NR_COUNTERS); + + return amu_group0_cnt_read_internal(idx); +} + +/* Write the group 0 counter identified by the given `idx` with `val`. */ +void amu_group0_cnt_write(int idx, uint64_t val) +{ + assert(amu_supported()); + assert(idx >= 0 && idx < AMU_GROUP0_NR_COUNTERS); + + amu_group0_cnt_write_internal(idx, val); + isb(); +} + +/* Read the group 1 counter identified by the given `idx`. */ +uint64_t amu_group1_cnt_read(int idx) +{ + assert(amu_supported()); + assert(idx >= 0 && idx < AMU_GROUP1_NR_COUNTERS); + + return amu_group1_cnt_read_internal(idx); +} + +/* Write the group 1 counter identified by the given `idx` with `val`. */ +void amu_group1_cnt_write(int idx, uint64_t val) +{ + assert(amu_supported()); + assert(idx >= 0 && idx < AMU_GROUP1_NR_COUNTERS); + + amu_group1_cnt_write_internal(idx, val); + isb(); +} + +void amu_group1_set_evtype(int idx, unsigned int val) +{ + assert(amu_supported()); + assert(idx >= 0 && idx < AMU_GROUP1_NR_COUNTERS); + + amu_group1_set_evtype_internal(idx, val); + isb(); } static void *amu_context_save(const void *arg) { struct amu_ctx *ctx; - uint64_t features; + int i; - features = read_id_pfr0() >> ID_PFR0_AMU_SHIFT; - if ((features & ID_PFR0_AMU_MASK) != 1) + if (!amu_supported()) return (void *)-1; ctx = &amu_ctxs[plat_my_core_pos()]; @@ -61,12 +116,14 @@ static void *amu_context_save(const void *arg) * counter values from the future via the memory mapped view. 
*/ write_amcntenclr0(AMU_GROUP0_COUNTERS_MASK); + write_amcntenclr1(AMU_GROUP1_COUNTERS_MASK); isb(); - ctx->group0_cnts[0] = read64_amevcntr00(); - ctx->group0_cnts[1] = read64_amevcntr01(); - ctx->group0_cnts[2] = read64_amevcntr02(); - ctx->group0_cnts[3] = read64_amevcntr03(); + for (i = 0; i < AMU_GROUP0_NR_COUNTERS; i++) + ctx->group0_cnts[i] = amu_group0_cnt_read(i); + + for (i = 0; i < AMU_GROUP1_NR_COUNTERS; i++) + ctx->group1_cnts[i] = amu_group1_cnt_read(i); return 0; } @@ -75,6 +132,7 @@ static void *amu_context_restore(const void *arg) { struct amu_ctx *ctx; uint64_t features; + int i; features = read_id_pfr0() >> ID_PFR0_AMU_SHIFT; if ((features & ID_PFR0_AMU_MASK) != 1) @@ -86,19 +144,16 @@ static void *amu_context_restore(const void *arg) assert(read_amcntenset0() == 0); /* Restore group 0 counters */ - if (AMU_GROUP0_COUNTERS_MASK & (1U << 0)) - write64_amevcntr00(ctx->group0_cnts[0]); - if (AMU_GROUP0_COUNTERS_MASK & (1U << 1)) - write64_amevcntr01(ctx->group0_cnts[1]); - if (AMU_GROUP0_COUNTERS_MASK & (1U << 2)) - write64_amevcntr02(ctx->group0_cnts[2]); - if (AMU_GROUP0_COUNTERS_MASK & (1U << 3)) - write64_amevcntr03(ctx->group0_cnts[3]); - isb(); + for (i = 0; i < AMU_GROUP0_NR_COUNTERS; i++) + amu_group0_cnt_write(i, ctx->group0_cnts[i]); + for (i = 0; i < AMU_GROUP1_NR_COUNTERS; i++) + amu_group1_cnt_write(i, ctx->group1_cnts[i]); /* Enable group 0 counters */ write_amcntenset0(AMU_GROUP0_COUNTERS_MASK); + /* Enable group 1 counters */ + write_amcntenset1(AMU_GROUP1_COUNTERS_MASK); return 0; } diff --git a/lib/extensions/amu/aarch32/amu_helpers.S b/lib/extensions/amu/aarch32/amu_helpers.S new file mode 100644 index 000000000..84dca04c3 --- /dev/null +++ b/lib/extensions/amu/aarch32/amu_helpers.S @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include <arch.h> +#include <assert_macros.S> +#include <asm_macros.S> + + .globl amu_group0_cnt_read_internal + .globl amu_group0_cnt_write_internal + .globl amu_group1_cnt_read_internal + .globl amu_group1_cnt_write_internal + .globl amu_group1_set_evtype_internal + +/* + * uint64_t amu_group0_cnt_read_internal(int idx); + * + * Given `idx`, read the corresponding AMU counter + * and return it in `r0`. + */ +func amu_group0_cnt_read_internal +#if ENABLE_ASSERTIONS + /* `idx` should be between [0, 3] */ + mov r1, r0 + lsr r1, r1, #2 + cmp r1, #0 + ASM_ASSERT(eq) +#endif + + /* + * Given `idx` calculate address of ldcopr16/bx lr instruction pair + * in the table below. + */ + adr r1, 1f + lsl r0, r0, #3 /* each ldcopr16/bx lr sequence is 8 bytes */ + add r1, r1, r0 + bx r1 +1: + ldcopr16 r0, r1, AMEVCNTR00 /* index 0 */ + bx lr + ldcopr16 r0, r1, AMEVCNTR01 /* index 1 */ + bx lr + ldcopr16 r0, r1, AMEVCNTR02 /* index 2 */ + bx lr + ldcopr16 r0, r1, AMEVCNTR03 /* index 3 */ + bx lr +endfunc amu_group0_cnt_read_internal + +/* + * void amu_group0_cnt_write_internal(int idx, uint64_t val); + * + * Given `idx`, write `val` to the corresponding AMU counter. + */ +func amu_group0_cnt_write_internal +#if ENABLE_ASSERTIONS + /* `idx` should be between [0, 3] */ + mov r2, r0 + lsr r2, r2, #2 + cmp r2, #0 + ASM_ASSERT(eq) +#endif + + /* + * Given `idx` calculate address of stcopr16/bx lr instruction pair + * in the table below. 
+ */ + adr r2, 1f + lsl r0, r0, #3 /* each stcopr16/bx lr sequence is 8 bytes */ + add r2, r2, r0 + bx r2 + +1: + stcopr16 r0,r1, AMEVCNTR00 /* index 0 */ + bx lr + stcopr16 r0,r1, AMEVCNTR01 /* index 1 */ + bx lr + stcopr16 r0,r1, AMEVCNTR02 /* index 2 */ + bx lr + stcopr16 r0,r1, AMEVCNTR03 /* index 3 */ + bx lr +endfunc amu_group0_cnt_write_internal + +/* + * uint64_t amu_group1_cnt_read_internal(int idx); + * + * Given `idx`, read the corresponding AMU counter + * and return it in `r0`. + */ +func amu_group1_cnt_read_internal +#if ENABLE_ASSERTIONS + /* `idx` should be between [0, 15] */ + mov r2, r0 + lsr r2, r2, #4 + cmp r2, #0 + ASM_ASSERT(eq) +#endif + + /* + * Given `idx` calculate address of ldcopr16/bx lr instruction pair + * in the table below. + */ + adr r1, 1f + lsl r0, r0, #3 /* each ldcopr16/bx lr sequence is 8 bytes */ + add r1, r1, r0 + bx r1 + +1: + ldcopr16 r0,r1, AMEVCNTR10 /* index 0 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR11 /* index 1 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR12 /* index 2 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR13 /* index 3 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR14 /* index 4 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR15 /* index 5 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR16 /* index 6 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR17 /* index 7 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR18 /* index 8 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR19 /* index 9 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1A /* index 10 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1B /* index 11 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1C /* index 12 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1D /* index 13 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1E /* index 14 */ + bx lr + ldcopr16 r0,r1, AMEVCNTR1F /* index 15 */ + bx lr +endfunc amu_group1_cnt_read_internal + +/* + * void amu_group1_cnt_write_internal(int idx, uint64_t val); + * + * Given `idx`, write `val` to the corresponding AMU counter. + */ +func amu_group1_cnt_write_internal +#if ENABLE_ASSERTIONS + /* `idx` should be between [0, 15] */ + mov r2, r0 + lsr r2, r2, #4 + cmp r2, #0 + ASM_ASSERT(eq) +#endif + + /* + * Given `idx` calculate address of ldcopr16/bx lr instruction pair + * in the table below. + */ + adr r2, 1f + lsl r0, r0, #3 /* each stcopr16/bx lr sequence is 8 bytes */ + add r2, r2, r0 + bx r2 + +1: + stcopr16 r0,r1, AMEVCNTR10 /* index 0 */ + bx lr + stcopr16 r0,r1, AMEVCNTR11 /* index 1 */ + bx lr + stcopr16 r0,r1, AMEVCNTR12 /* index 2 */ + bx lr + stcopr16 r0,r1, AMEVCNTR13 /* index 3 */ + bx lr + stcopr16 r0,r1, AMEVCNTR14 /* index 4 */ + bx lr + stcopr16 r0,r1, AMEVCNTR15 /* index 5 */ + bx lr + stcopr16 r0,r1, AMEVCNTR16 /* index 6 */ + bx lr + stcopr16 r0,r1, AMEVCNTR17 /* index 7 */ + bx lr + stcopr16 r0,r1, AMEVCNTR18 /* index 8 */ + bx lr + stcopr16 r0,r1, AMEVCNTR19 /* index 9 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1A /* index 10 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1B /* index 11 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1C /* index 12 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1D /* index 13 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1E /* index 14 */ + bx lr + stcopr16 r0,r1, AMEVCNTR1F /* index 15 */ + bx lr +endfunc amu_group1_cnt_write_internal + +/* + * void amu_group1_set_evtype_internal(int idx, unsigned int val); + * + * Program the AMU event type register indexed by `idx` + * with the value `val`. 
+ */ +func amu_group1_set_evtype_internal +#if ENABLE_ASSERTIONS + /* `idx` should be between [0, 15] */ + mov r2, r0 + lsr r2, r2, #4 + cmp r2, #0 + ASM_ASSERT(eq) + + /* val should be between [0, 65535] */ + mov r2, r1 + lsr r2, r2, #16 + cmp r2, #0 + ASM_ASSERT(eq) +#endif + + /* + * Given `idx` calculate address of stcopr/bx lr instruction pair + * in the table below. + */ + adr r2, 1f + lsl r0, r0, #3 /* each stcopr/bx lr sequence is 8 bytes */ + add r2, r2, r0 + bx r2 + +1: + stcopr r0, AMEVTYPER10 /* index 0 */ + bx lr + stcopr r0, AMEVTYPER11 /* index 1 */ + bx lr + stcopr r0, AMEVTYPER12 /* index 2 */ + bx lr + stcopr r0, AMEVTYPER13 /* index 3 */ + bx lr + stcopr r0, AMEVTYPER14 /* index 4 */ + bx lr + stcopr r0, AMEVTYPER15 /* index 5 */ + bx lr + stcopr r0, AMEVTYPER16 /* index 6 */ + bx lr + stcopr r0, AMEVTYPER17 /* index 7 */ + bx lr + stcopr r0, AMEVTYPER18 /* index 8 */ + bx lr + stcopr r0, AMEVTYPER19 /* index 9 */ + bx lr + stcopr r0, AMEVTYPER1A /* index 10 */ + bx lr + stcopr r0, AMEVTYPER1B /* index 11 */ + bx lr + stcopr r0, AMEVTYPER1C /* index 12 */ + bx lr + stcopr r0, AMEVTYPER1D /* index 13 */ + bx lr + stcopr r0, AMEVTYPER1E /* index 14 */ + bx lr + stcopr r0, AMEVTYPER1F /* index 15 */ + bx lr +endfunc amu_group1_set_evtype_internal diff --git a/lib/extensions/amu/aarch64/amu.c b/lib/extensions/amu/aarch64/amu.c index d7645a9e1..7d39f35c1 100644 --- a/lib/extensions/amu/aarch64/amu.c +++ b/lib/extensions/amu/aarch64/amu.c @@ -172,7 +172,6 @@ static void *amu_context_restore(const void *arg) for (i = 0; i < AMU_GROUP1_NR_COUNTERS; i++) if (AMU_GROUP1_COUNTERS_MASK & (1U << i)) amu_group1_cnt_write(i, ctx->group1_cnts[i]); - isb(); /* Restore group 0/1 counter configuration */ write_amcntenset0_el0(AMU_GROUP0_COUNTERS_MASK); diff --git a/lib/psci/psci_main.c b/lib/psci/psci_main.c index 8e41cf026..88cf5cbe4 100644 --- a/lib/psci/psci_main.c +++ b/lib/psci/psci_main.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2018, ARM Limited and Contributors. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ #include <arch.h> #include <arch_helpers.h> +#include <arm_arch_svc.h> #include <assert.h> #include <debug.h> #include <platform.h> @@ -322,6 +323,9 @@ int psci_features(unsigned int psci_fid) { unsigned int local_caps = psci_caps; + if (psci_fid == SMCCC_VERSION) + return PSCI_E_SUCCESS; + /* Check if it is a 64 bit function */ if (((psci_fid >> FUNCID_CC_SHIFT) & FUNCID_CC_MASK) == SMC_64) local_caps &= PSCI_CAP_64BIT_MASK; |
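The reworked BPIALL vbar1 sync handler above keys all post-workaround dispatch off the vector id saved in w2 (1 = Sync, 2 = IRQ, 4 = FIQ, 8 = SError). A minimal C sketch of that tbz/tbnz fan-out follows; the handler functions are placeholders standing in for the EL3 runtime exception handlers the assembly branches to, not real TF-A symbols.

#include <stdio.h>

enum { VEC_SYNC = 1, VEC_IRQ = 2, VEC_FIQ = 4, VEC_SERROR = 8 };

static void sync_exception_aarch64(void)  { puts("sync");   }
static void irq_aarch64(void)             { puts("irq");    }
static void fiq_aarch64(void)             { puts("fiq");    }
static void serror_aarch64(void)          { puts("serror"); }

/*
 * Mirrors the assembly: "tbz w2, #0" skips the Sync path, then
 * workaround_not_sync tests bit 3 (SError) and bit 2 (FIQ), leaving IRQ as
 * the remaining case. The real Sync path additionally contains the
 * SMCCC_ARCH_WORKAROUND_1 fast return shown in the next sketch.
 */
static void dispatch_original_vector(unsigned int w2)
{
        if (w2 & VEC_SYNC)
                sync_exception_aarch64();
        else if (w2 & VEC_SERROR)
                serror_aarch64();
        else if (w2 & VEC_FIQ)
                fiq_aarch64();
        else
                irq_aarch64();
}

int main(void)
{
        dispatch_original_vector(VEC_FIQ);      /* prints "fiq" */
        return 0;
}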
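Both workaround variants test for the SMCCC_ARCH_WORKAROUND_1 fast path with a cmp/ccmp pair so that a single conditional branch decides whether the trapped SMC can be completed with a bare ERET. Below is a C rendering of that condition; ESR_EL3_A64_SMC0 is the value defined in the diff, while 0x80008000 is assumed here for SMCCC_ARCH_WORKAROUND_1 (the diff takes the real value from arm_arch_svc.h).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SMCCC_ARCH_WORKAROUND_1 0x80008000u     /* assumed SMCCC 1.1 value */
#define ESR_EL3_A64_SMC0        0x5e000000u     /* SMC #0 from AArch64, per the diff */

/*
 * Equivalent of the cmp/ccmp sequence:
 *   (W0 == SMCCC_ARCH_WORKAROUND_1) ? (ESR_EL3 == SMC#0) : false
 * When true, the exception is exactly "SMC #0 from AArch64 requesting the
 * workaround", so the handler can return with ERET immediately.
 */
static bool is_workaround_fast_path(uint32_t w0, uint32_t esr_el3)
{
        return (w0 == SMCCC_ARCH_WORKAROUND_1) && (esr_el3 == ESR_EL3_A64_SMC0);
}

int main(void)
{
        printf("%d\n", is_workaround_fast_path(SMCCC_ARCH_WORKAROUND_1,
                                               ESR_EL3_A64_SMC0));      /* 1 */
        printf("%d\n", is_workaround_fast_path(SMCCC_ARCH_WORKAROUND_1,
                                               0x5e000001u));           /* 0 */
        return 0;
}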
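The AArch32 amu.c changes expose the same counter interface as the AArch64 side: amu_group0/1_cnt_read/write and amu_group1_set_evtype, each guarded by amu_supported() and an index assertion. A hypothetical platform-side usage sketch follows; the prototypes are copied from the diff (a real port would get them from amu.h), and the event number 0x11 is purely illustrative rather than a statement about any CPU's AMU event map.

#include <stdint.h>

uint64_t amu_group1_cnt_read(int idx);
void amu_group1_cnt_write(int idx, uint64_t val);
void amu_group1_set_evtype(int idx, unsigned int val);

/*
 * Program group-1 counter 0 with an event type and reset it, so that later
 * reads return the count accumulated since this call. The counter itself is
 * enabled by amu_enable() via AMCNTENSET1 in the patch above.
 */
void plat_start_amu_group1_counter0(void)
{
        amu_group1_set_evtype(0, 0x11);   /* illustrative event number */
        amu_group1_cnt_write(0, 0);       /* count from zero */
}

uint64_t plat_read_amu_group1_counter0(void)
{
        return amu_group1_cnt_read(0);
}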
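The new amu_helpers.S dispatches by branching to "1f + idx * 8", relying on every ldcopr16/bx lr (or stcopr16/bx lr) pair being exactly 8 bytes long. The closest idiomatic C shape is an index into a table of accessors, sketched below; the read_amevcntr0x functions are stand-ins for the per-counter CP15 accesses, not real TF-A helpers.

#include <stdint.h>

#define AMU_GROUP0_NR_COUNTERS  4

/* Stand-ins for the ldcopr16 reads of AMEVCNTR00..AMEVCNTR03. */
static uint64_t read_amevcntr00(void) { return 0; }
static uint64_t read_amevcntr01(void) { return 0; }
static uint64_t read_amevcntr02(void) { return 0; }
static uint64_t read_amevcntr03(void) { return 0; }

static uint64_t (*const group0_readers[AMU_GROUP0_NR_COUNTERS])(void) = {
        read_amevcntr00,
        read_amevcntr01,
        read_amevcntr02,
        read_amevcntr03,
};

/*
 * The assembly bounds-checks idx with "lsr #2 ; cmp #0" under
 * ENABLE_ASSERTIONS; the equivalent guard here is an explicit range test.
 */
uint64_t group0_cnt_read_sketch(int idx)
{
        if (idx < 0 || idx >= AMU_GROUP0_NR_COUNTERS)
                return 0;
        return group0_readers[idx]();
}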
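Finally, psci_features() now answers SMCCC_VERSION before consulting psci_caps, so that SMCCC v1.1 discovery, which begins with a PSCI_FEATURES(SMCCC_VERSION) query, reports success. A stripped-down sketch of that control flow, assuming the SMCCC function id 0x80000000 for SMCCC_VERSION and a placeholder capability lookup in place of the real psci_caps handling:

#define PSCI_E_SUCCESS          0
#define PSCI_E_NOT_SUPPORTED    (-1)
#define SMCCC_VERSION           0x80000000u     /* assumed SMCCC function id */

/* Placeholder for the psci_caps/local_caps test in the real function. */
static int psci_fid_is_implemented(unsigned int psci_fid)
{
        (void)psci_fid;
        return 0;
}

int psci_features_sketch(unsigned int psci_fid)
{
        /*
         * Early-out added by the patch: SMCCC_VERSION is reported as
         * supported even though it is not a PSCI function id.
         */
        if (psci_fid == SMCCC_VERSION)
                return PSCI_E_SUCCESS;

        return psci_fid_is_implemented(psci_fid) ? PSCI_E_SUCCESS
                                                 : PSCI_E_NOT_SUPPORTED;
}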