author     Chitti Babu Theegala <ctheegal@codeaurora.org>  2016-12-16 02:13:28 +0530
committer  Uday Kishore Pasupuleti <upasupul@codeaurora.org>  2016-12-19 15:11:43 -0800
commit     cbfdc7f9054e0bc8071aaf3a70afc00273a8a869 (patch)
tree       3a8e2572533c625e70d56e891a7d267b7639c885 /libc/arch-arm
parent     c816e9fa03c67b3a5a112836a8e52b988c42f7ee (diff)
Fix streaming (memcpy) performance on Cortex-A7
Stream-mode detection for L1 in the A7 core fails for addresses that are not cache-line-size (64-byte) aligned, which leads to destination data being cached unnecessarily. ARM has confirmed this A7 issue. The fix is to align the destination address to a 64-byte boundary before entering the main loop of the memcpy routine. Although micro_bench memcpy scores drop when the L1 cache is bypassed, this is desirable: it avoids unnecessary eviction of other processes' data from L1, which is better for overall system performance. The higher micro_bench memcpy numbers seen with less-than-64-byte alignment come at the cost of L1 cache pollution: during memcpy/memset, unneeded data fills the L1 cache and evicts other processes' data. For example, during memset(0) the L1 cache fills with zeros, which should be avoided.

Additionally, there is a second Cortex-A7 issue that hurts performance for all alignments and on all Android Wear versions: the store buffer on the A7 is 32 bytes, which limits back-to-back 32-byte stores. In the current implementation, back-to-back 32-byte writes cause CPU stalls. This is solved by interleaving loads and stores, which avoids the stalls during memcpy by making efficient use of the A7's internal load and store buffers.

Change-Id: Ie5f12f2bb5d86f627686730416279057e4f5f6d0
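To make the first fix concrete, here is a minimal C sketch (not part of the patch) of the destination-alignment step; memcpy_align64_sketch and the plain memcpy calls are hypothetical stand-ins for the patch's NEON copy sequences:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch: align dst to a 64-byte cache line before the
     * bulk loop, mirroring the `rsb r3, r0, #0` / `ands r3, r3, #0x30`
     * sequence in the assembly. Assumes dst is already 16-byte aligned,
     * as it is when the assembly reaches label 2. */
    static void *memcpy_align64_sketch(void *dst, const void *src, size_t n) {
        uint8_t *d = dst;
        const uint8_t *s = src;

        if (n > 256) {
            /* (-dst) & 0x30 is the number of bytes (0, 16, 32 or 48)
             * from dst up to the next 64-byte boundary. */
            size_t head = (size_t)(0 - (uintptr_t)d) & 0x30;
            memcpy(d, s, head);   /* the assembly does this with NEON vld1/vst1 */
            d += head;
            s += head;
            n -= head;
        }

        /* ... the 64-byte-per-iteration main loop runs from here ... */
        memcpy(d, s, n);
        return dst;
    }

Skipping the alignment work for copies of 256 bytes or less matches the `cmp r2, #256` guard in the patch: for short copies the alignment preamble would cost more than the streaming behaviour saves.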
Diffstat (limited to 'libc/arch-arm')
-rw-r--r--  libc/arch-arm/cortex-a7/bionic/memcpy_base.S | 28
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
index 1d152bbc1..4ff982b0f 100644
--- a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
@@ -101,16 +101,38 @@
         vld1.8  {d0}, [r1]!
         vst1.8  {d0}, [r0, :64]!
 
-2:      // Make sure we have at least 64 bytes to copy.
+2:      cmp     r2, #256                // only align for copies larger than 256 bytes
+        ble     .L_copy_loop
+
+        // Make sure DST is 64 BYTE aligned.
+        rsb     r3, r0, #0              // r3 = -dst
+        ands    r3, r3, #0x30           // bytes (16, 32 or 48) to the next 64-byte line
+        beq     .L_copy_loop            // dst is already 64-byte aligned
+
+        sub     r2, r2, r3              // account for the head copy in the count
+        cmp     r3, #0x10
+        beq     .L_copy_16              // only 16 head bytes are needed
+
+        vld1.8  {d0 - d3}, [r1]!        // copy 32 head bytes
+        vst1.8  {d0 - d3}, [r0, :128]!
+        ands    r3, r3, #0x10           // the 48-byte case needs 16 more
+        beq     .L_copy_loop
+
+.L_copy_16:
+        vld1.8  {d0, d1}, [r1]!         // copy the last 16 head bytes
+        vst1.8  {d0, d1}, [r0, :128]!
+
+.L_copy_loop:
+        // Make sure we have at least 64 bytes to copy.
         subs    r2, r2, #64
         blo     2f
 
 1:      // The main loop copies 64 bytes at a time.
         vld1.8  {d0 - d3}, [r1]!
-        vld1.8  {d4 - d7}, [r1]!
+        vst1.8  {d0 - d3}, [r0, :128]!  // store the first 32 bytes before the next load
         pld     [r1, #(64*4)]
         subs    r2, r2, #64
-        vst1.8  {d0 - d3}, [r0, :128]!
+        vld1.8  {d4 - d7}, [r1]!        // interleaved load lets the store buffer drain
         vst1.8  {d4 - d7}, [r0, :128]!
         bhs     1b
 
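For the second fix, here is a hedged C sketch of the interleaved main loop using NEON intrinsics; copy64_interleaved is a hypothetical illustration, not the patch's code, and a compiler is free to reschedule intrinsics, so only the assembly above guarantees the issue order:

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative sketch of the interleaved 64-byte main loop. Each
     * vld1q/vst1q pair moves 32 bytes, the size of the A7 store buffer;
     * issuing the next loads between the two 32-byte store bursts gives
     * the buffer time to drain instead of stalling the CPU. */
    static void copy64_interleaved(uint8_t *dst, const uint8_t *src, size_t n) {
        while (n >= 64) {
            uint8x16_t a0 = vld1q_u8(src);
            uint8x16_t a1 = vld1q_u8(src + 16);
            vst1q_u8(dst, a0);                  /* first 32-byte store burst */
            vst1q_u8(dst + 16, a1);
            uint8x16_t b0 = vld1q_u8(src + 32); /* loads drain the store buffer */
            uint8x16_t b1 = vld1q_u8(src + 48);
            vst1q_u8(dst + 32, b0);             /* second 32-byte store burst */
            vst1q_u8(dst + 48, b1);
            src += 64;
            dst += 64;
            n -= 64;
        }
    }

The point of the ordering is that no more than 32 bytes of stores, the A7's store-buffer capacity, are ever issued back to back, which is exactly what the reordered vld1.8/vst1.8 sequence in the hunk above achieves.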