author    Alexei Fedorov <Alexei.Fedorov@arm.com>  2020-09-01 15:38:32 +0100
committer Alexei Fedorov <Alexei.Fedorov@arm.com>  2020-09-02 16:21:34 +0000
commit    e3f2b1a93211811567a2db0938b814758401b358 (patch)
tree      f81f8204ae22f734eac1552e48da8dd4a7cd6e79 /lib/libc/aarch32
parent    29b76f2e9f5c6f8e3a86134391eee417b3f0f6a8 (diff)
plat/arm: Introduce and use libc_asm.mk makefile
Trace analysis of the FVP_Base_AEMv8A 0.0/6063 model
running in AArch32 mode with the build options
listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that when auth_signature() gets called,
71.99% of CPU execution time is spent in the memset()
function, which is written in C using single-byte write
operations, see lib/libc/memset.c.
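For reference, a byte-at-a-time memset() in C looks roughly like the
sketch below; this is only an illustration of the single-byte-write
approach (the memset_bytewise name is hypothetical), not a verbatim
copy of lib/libc/memset.c:

#include <stddef.h>

/* Illustrative only: one byte is written per loop iteration, so large
 * clears (such as the buffers touched during signature authentication)
 * dominate the execution profile. */
void *memset_bytewise(void *dst, int val, size_t count)
{
	unsigned char *ptr = dst;

	while (count-- != 0U) {
		*ptr++ = (unsigned char)val;
	}

	return dst;
}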
This patch introduces a new libc_asm.mk makefile which
replaces the C memset() implementation with an assembler
version, giving the following results:
- for AArch32, memset() CPU time within the auth_signature()
call is reduced to 20.56%.
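The assembler version gets its speed by replicating 'val' across a
full register and storing several words per iteration once 'dst' is
word-aligned. A simplified C analogue of that idea is sketched below;
it is illustrative only (the memset_wordwise name is hypothetical) and
is not the code added by this patch:

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch: align the destination, then store 32-bit words
 * instead of single bytes. The assembler version goes further and
 * stores 16 or 32 bytes per iteration with stmia, but the idea is
 * the same. */
void *memset_wordwise(void *dst, int val, size_t count)
{
	unsigned char *ptr = dst;
	uint32_t word = (unsigned char)val;

	/* Leading bytes until 'ptr' is 4-byte aligned. */
	while ((((uintptr_t)ptr & 3U) != 0U) && (count != 0U)) {
		*ptr++ = (unsigned char)val;
		count--;
	}

	/* Propagate the byte value into all four byte lanes. */
	word |= word << 8;
	word |= word << 16;

	/* Aligned 32-bit stores while at least 4 bytes remain. */
	while (count >= 4U) {
		*(uint32_t *)ptr = word;
		ptr += 4;
		count -= 4;
	}

	/* Trailing bytes. */
	while (count-- != 0U) {
		*ptr++ = (unsigned char)val;
	}

	return dst;
}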
The number of CPU instructions (Inst) executed during
the TF-A boot stage before the start of BL33 in RELEASE
builds is presented for the different versions in the
tables below (Set is the instruction set, Size is the
code size of memset() in bytes, and Ratio is Inst
relative to the C TF-A baseline), where:
- C TF-A: existing TF-A C code;
- C musl: C implementation from musl, a lightweight
"implementation of the standard library for Linux-based systems"
https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c
- Asm Opt: assembler version from the "Arm Optimized Routines"
project
https://github.com/ARM-software/optimized-routines/blob/master/string/arm/memset.S
- Asm Linux: assembler version from the Linux kernel
https://github.com/torvalds/linux/blob/master/arch/arm/lib/memset.S
- Asm TF-A: assembler version from this patch
AArch32:
+-----------+------+------+--------------+----------+
| Variant | Set | Size | Inst | Ratio |
+-----------+------+------+--------------+----------+
| C TF-A | T32 | 16 | 2122110003 | 1.000000 |
| C musl | T32 | 156 | 1643917668 | 0.774662 |
| Asm Opt | T32 | 84 | 1604810003 | 0.756233 |
| Asm Linux | A32 | 168 | 1566255018 | 0.738065 |
| Asm TF-A | A32 | 160 | 1525865101 | 0.719032 |
+-----------+------+------+--------------+----------+
AArch64:
+-----------+------+------------+----------+
| Variant | Size | Inst | Ratio |
+-----------+------+------------+----------+
| C TF-A | 28 | 2732497518 | 1.000000 |
| C musl | 212 | 1802999999 | 0.659836 |
| Asm TF-A | 140 | 1680260003 | 0.614917 |
+-----------+------+------------+----------+
This patch modifies 'plat/arm/common/arm_common.mk'
to override the libc.mk makefile with libc_asm.mk and
does not affect other platforms.
Change-Id: Ie89dd0b74ba1079420733a0d76b7366ad0157c2e
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
Diffstat (limited to 'lib/libc/aarch32')
-rw-r--r--   lib/libc/aarch32/memset.S   74
1 file changed, 74 insertions, 0 deletions
diff --git a/lib/libc/aarch32/memset.S b/lib/libc/aarch32/memset.S
new file mode 100644
index 000000000..880ba8382
--- /dev/null
+++ b/lib/libc/aarch32/memset.S
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+	.syntax unified
+	.global memset
+
+/* -----------------------------------------------------------------------
+ * void *memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ *
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	mov	r12, r0			/* keep r0 */
+	tst	r0, #3
+	beq	aligned			/* 4-bytes aligned */
+
+	/* Unaligned 'dst' */
+unaligned:
+	subs	r2, r2, #1
+	strbhs	r1, [r12], #1
+	bxls	lr			/* return if 0 */
+	tst	r12, #3
+	bne	unaligned		/* continue while unaligned */
+
+	/* 4-bytes aligned */
+aligned:bfi	r1, r1, #8, #8		/* propagate 'val' */
+	bfi	r1, r1, #16, #16
+
+	mov	r3, r1
+
+	cmp	r2, #16
+	blo	less_16			/* < 16 */
+
+	push	{r4, lr}
+	mov	r4, r1
+	mov	lr, r1
+
+write_32:
+	subs	r2, r2, #32
+	stmiahs	r12!, {r1, r3, r4, lr}
+	stmiahs	r12!, {r1, r3, r4, lr}
+	bhi	write_32		/* write 32 bytes in a loop */
+	popeq	{r4, pc}		/* return if 0 */
+	lsls	r2, r2, #28		/* C = r2[4]; N = r2[3]; Z = r2[3:0] */
+	stmiacs	r12!, {r1, r3, r4, lr}	/* write 16 bytes */
+	popeq	{r4, pc}		/* return if 16 */
+	stmiami	r12!, {r1, r3}		/* write 8 bytes */
+	lsls	r2, r2, #2		/* C = r2[2]; N = r2[1]; Z = r2[1:0] */
+	strcs	r1, [r12], #4		/* write 4 bytes */
+	popeq	{r4, pc}		/* return if 8 or 4 */
+	strhmi	r1, [r12], #2		/* write 2 bytes */
+	lsls	r2, r2, #1		/* N = Z = r2[0] */
+	strbmi	r1, [r12]		/* write 1 byte */
+	pop	{r4, pc}
+
+less_16:lsls	r2, r2, #29		/* C = r2[3]; N = r2[2]; Z = r2[2:0] */
+	stmiacs	r12!, {r1, r3}		/* write 8 bytes */
+	bxeq	lr			/* return if 8 */
+	strmi	r1, [r12], #4		/* write 4 bytes */
+	lsls	r2, r2, #2		/* C = r2[1]; N = Z = r2[0] */
+	strhcs	r1, [r12], #2		/* write 2 bytes */
+	strbmi	r1, [r12]		/* write 1 byte */
+	bx	lr
+
+endfunc memset
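One way to sanity-check an optimised memset() such as the one above is
a host-side sweep over buffer sizes and start offsets, comparing it
against a trivial byte-wise reference. The sketch below is hedged:
memset_under_test is a hypothetical stand-in for whichever
implementation is linked in, and here it simply forwards to the
toolchain's memset() so the sketch builds anywhere:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the implementation being checked. */
static void *memset_under_test(void *dst, int val, size_t count)
{
	return memset(dst, val, count);
}

/* Trivial byte-wise reference. */
static void ref_set(unsigned char *dst, int val, size_t count)
{
	for (size_t i = 0; i < count; i++) {
		dst[i] = (unsigned char)val;
	}
}

int main(void)
{
	unsigned char expected[256], actual[256];

	/* Sweep sizes and start offsets to cover the aligned, unaligned,
	 * short and long paths of the implementation. */
	for (size_t size = 0; size <= 64; size++) {
		for (size_t offset = 0; offset < 8; offset++) {
			memset(expected, 0xAA, sizeof(expected));
			memset(actual, 0xAA, sizeof(actual));

			ref_set(expected + offset, 0x5C, size);
			memset_under_test(actual + offset, 0x5C, size);

			assert(memcmp(expected, actual, sizeof(expected)) == 0);
		}
	}

	printf("all size/offset combinations match\n");
	return 0;
}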