From f8a907d25a9f319e67fcf005638adb52fa09dd8b Mon Sep 17 00:00:00 2001 From: Brent DeGraaf Date: Wed, 2 Oct 2013 13:47:11 +0000 Subject: [AOSP Master] libc: krait: Use performance version of memcpy * This commit improves performance for small copies compared to the original CAF one. It also cleans up some functions. Change-Id: Iaa52635240da8b8746693186b66b69778e833c32 --- libc/arch-arm/krait/bionic/__strcat_chk.S | 19 ++--- libc/arch-arm/krait/bionic/__strcpy_chk.S | 15 +--- libc/arch-arm/krait/bionic/memcpy.S | 17 ++-- libc/arch-arm/krait/bionic/memcpy_base.S | 124 ++++++++++++------------------ 4 files changed, 71 insertions(+), 104 deletions(-) (limited to 'libc') diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S index 246f159c0..1a39c5b88 100644 --- a/libc/arch-arm/krait/bionic/__strcat_chk.S +++ b/libc/arch-arm/krait/bionic/__strcat_chk.S @@ -40,7 +40,7 @@ ENTRY(__strcat_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 push {r4, r5} @@ -177,7 +177,7 @@ ENTRY(__strcat_chk) .L_strlen_done: add r2, r3, r4 cmp r2, lr - bhi __strcat_chk_failed + bhi .L_strcat_chk_failed // Set up the registers for the memcpy code. mov r1, r5 @@ -185,20 +185,17 @@ ENTRY(__strcat_chk) mov r2, r4 add r0, r0, r3 pop {r4, r5} -END(__strcat_chk) + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 -#define MEMCPY_BASE __strcat_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcat_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 + // Undo the above cfi directives. .cfi_adjust_cfa_offset 8 .cfi_rel_offset r4, 0 .cfi_rel_offset r5, 4 - +.L_strcat_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -208,7 +205,7 @@ error_code: .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcat_chk_failed) +END(__strcat_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S index db766863a..00202f3da 100644 --- a/libc/arch-arm/krait/bionic/__strcpy_chk.S +++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__strcpy_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 @@ -149,21 +149,14 @@ ENTRY(__strcpy_chk) pld [r1, #64] ldr r0, [sp] cmp r3, lr - bhs __strcpy_chk_failed + bhs .L_strcpy_chk_failed // Add 1 for copy length to get the string terminator. add r2, r3, #1 -END(__strcpy_chk) -#define MEMCPY_BASE __strcpy_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcpy_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - +.L_strcpy_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -173,7 +166,7 @@ error_code: .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcpy_chk_failed) +END(__strcpy_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index 9ff46a8ac..5d27b574f 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -45,7 +45,7 @@ ENTRY(__memcpy_chk) cmp r2, r3 - bhi __memcpy_chk_fail + bhi .L_memcpy_chk_fail // Fall through to memcpy... END(__memcpy_chk) @@ -53,19 +53,20 @@ END(__memcpy_chk) ENTRY(memcpy) pld [r1, #64] stmfd sp!, {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 -END(memcpy) -#define MEMCPY_BASE __memcpy_base -#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__memcpy_chk_fail) + // Undo the cfi directives from above. + .cfi_adjust_cfa_offset -8 + .cfi_restore r0 + .cfi_restore lr +.L_memcpy_chk_fail: // Preserve lr for backtrace. push {lr} - .cfi_def_cfa_offset 4 + .cfi_adjust_cfa_offset 4 .cfi_rel_offset lr, 0 ldr r0, error_message @@ -77,7 +78,7 @@ error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__memcpy_chk_fail) +END(memcpy) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S index 068f2f60c..76c5a8459 100644 --- a/libc/arch-arm/krait/bionic/memcpy_base.S +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -30,59 +30,35 @@ #include #include -/* - * These default settings are good for all Krait-based systems - * as of this writing, but they can be overridden in: - * device///BoardConfig.mk - * by setting the following: - * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true - * TARGET_USE_KRAIT_PLD_SET := true - * TARGET_KRAIT_BIONIC_PLDOFFS := - * TARGET_KRAIT_BIONIC_PLDSIZE := - * TARGET_KRAIT_BIONIC_PLDTHRESH := - * TARGET_KRAIT_BIONIC_BBTHRESH := - */ - -#ifndef PLDOFFS #define PLDOFFS (10) -#endif -#ifndef PLDTHRESH #define PLDTHRESH (PLDOFFS) -#endif -#ifndef BBTHRESH #define BBTHRESH (4096/64) -#endif +#define PLDSIZE (64) + #if (PLDOFFS < 1) #error Routine does not support offsets less than 1 #endif + #if (PLDTHRESH < PLDOFFS) #error PLD threshold must be greater than or equal to the PLD offset #endif -#ifndef PLDSIZE -#define PLDSIZE (64) -#endif + .text .fpu neon -ENTRY(MEMCPY_BASE) -MEMCPY_BASE_ALIGNED: - // .cfi_startproc - .save {r0, r9, r10, lr} - // .cfi_def_cfa_offset 8 - //.cfi_rel_offset r0, 0 - //.cfi_rel_offset lr, 4 +.L_memcpy_base: cmp r2, #4 - blt .Lneon_lt4 + blt .L_neon_lt4 cmp r2, #16 - blt .Lneon_lt16 + blt .L_neon_lt16 cmp r2, #32 - blt .Lneon_16 + blt .L_neon_16 cmp r2, #64 - blt .Lneon_copy_32_a + blt .L_neon_copy_32_a mov r12, r2, lsr #6 cmp r12, #PLDTHRESH - ble .Lneon_copy_64_loop_nopld + ble .L_neon_copy_64_loop_nopld push {r9, r10} .cfi_adjust_cfa_offset 8 @@ -90,7 +66,7 @@ MEMCPY_BASE_ALIGNED: .cfi_rel_offset r10, 4 cmp r12, #BBTHRESH - ble .Lneon_prime_pump + ble .L_neon_prime_pump add lr, r0, #0x400 add r9, r1, #(PLDOFFS*PLDSIZE) @@ -99,12 +75,12 @@ MEMCPY_BASE_ALIGNED: lsr lr, lr, #21 add lr, lr, #(PLDOFFS*PLDSIZE) cmp r12, lr, lsr #6 - ble .Lneon_prime_pump + ble .L_neon_prime_pump itt gt movgt r9, #(PLDOFFS) rsbsgt r9, r9, lr, lsr #6 - ble .Lneon_prime_pump + ble .L_neon_prime_pump add r10, r1, lr bic r10, #0x3F @@ -118,7 +94,7 @@ MEMCPY_BASE_ALIGNED: movgt r12, #0 pld [r1, #((PLDOFFS-1)*PLDSIZE)] -.Lneon_copy_64_loop_outer_doublepld: +.L_neon_copy_64_loop_outer_doublepld: pld [r1, #((PLDOFFS)*PLDSIZE)] vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! @@ -127,14 +103,14 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_outer_doublepld + bne .L_neon_copy_64_loop_outer_doublepld cmp r12, #0 - beq .Lneon_pop_before_nopld + beq .L_neon_pop_before_nopld cmp r12, #(512*1024/64) - blt .Lneon_copy_64_loop_outer + blt .L_neon_copy_64_loop_outer -.Lneon_copy_64_loop_ddr: +.L_neon_copy_64_loop_ddr: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! pld [r10] @@ -142,16 +118,17 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_ddr - b .Lneon_pop_before_nopld + bne .L_neon_copy_64_loop_ddr + b .L_neon_pop_before_nopld -.Lneon_prime_pump: +.L_neon_prime_pump: mov lr, #(PLDOFFS*PLDSIZE) add r10, r1, #(PLDOFFS*PLDSIZE) bic r10, #0x3F sub r12, r12, #PLDOFFS ldr r3, [r10, #(-1*PLDSIZE)] -.Lneon_copy_64_loop_outer: + +.L_neon_copy_64_loop_outer: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! ldr r3, [r10] @@ -159,47 +136,49 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_outer -.Lneon_pop_before_nopld: + bne .L_neon_copy_64_loop_outer + +.L_neon_pop_before_nopld: mov r12, lr, lsr #6 pop {r9, r10} + .cfi_adjust_cfa_offset -8 .cfi_restore r9 .cfi_restore r10 - .cfi_adjust_cfa_offset -8 -.Lneon_copy_64_loop_nopld: +.L_neon_copy_64_loop_nopld: vld1.32 {q8, q9}, [r1]! vld1.32 {q10, q11}, [r1]! subs r12, r12, #1 vst1.32 {q8, q9}, [r0]! vst1.32 {q10, q11}, [r0]! - bne .Lneon_copy_64_loop_nopld + bne .L_neon_copy_64_loop_nopld ands r2, r2, #0x3f - .cfi_restore r0 - .cfi_adjust_cfa_offset -4 - beq .Lneon_exit -.Lneon_copy_32_a: + beq .L_neon_exit + +.L_neon_copy_32_a: movs r3, r2, lsl #27 - bcc .Lneon_16 + bcc .L_neon_16 vld1.32 {q0,q1}, [r1]! vst1.32 {q0,q1}, [r0]! -.Lneon_16: - bpl .Lneon_lt16 + +.L_neon_16: + bpl .L_neon_lt16 vld1.32 {q8}, [r1]! vst1.32 {q8}, [r0]! ands r2, r2, #0x0f - beq .Lneon_exit -.Lneon_lt16: + beq .L_neon_exit + +.L_neon_lt16: movs r3, r2, lsl #29 - itttt cs - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - itt mi - ldrmi r3, [r1], #4 - strmi r3, [r0], #4 -.Lneon_lt4: + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: + bge .L_neon_lt4 + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +.L_neon_lt4: movs r2, r2, lsl #31 itt cs ldrhcs r3, [r1], #2 @@ -207,9 +186,6 @@ MEMCPY_BASE_ALIGNED: itt mi ldrbmi r3, [r1] strbmi r3, [r0] -.Lneon_exit: - pop {r0, lr} - bx lr - //.cfi_endproc -END(MEMCPY_BASE) +.L_neon_exit: + pop {r0, pc} -- cgit v1.2.3