From 1d0268c6b855531eedd297f1cb7e4ac5817c9103 Mon Sep 17 00:00:00 2001 From: Brent DeGraaf Date: Wed, 2 Oct 2013 13:47:11 +0000 Subject: [PATCH] libc: krait: Use performance version of memcpy Change-Id: Iaa52635240da8b8746693186b66b69778e833c32 --- libc/arch-arm/krait/bionic/__strcat_chk.S | 19 +- libc/arch-arm/krait/bionic/__strcpy_chk.S | 15 +- libc/arch-arm/krait/bionic/memcpy.S | 17 +- libc/arch-arm/krait/bionic/memcpy_base.S | 285 ++++++++++++++-------- 4 files changed, 198 insertions(+), 138 deletions(-) diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S index 246f159c0..1a39c5b88 100644 --- a/libc/arch-arm/krait/bionic/__strcat_chk.S +++ b/libc/arch-arm/krait/bionic/__strcat_chk.S @@ -40,7 +40,7 @@ ENTRY(__strcat_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 push {r4, r5} @@ -177,7 +177,7 @@ ENTRY(__strcat_chk) .L_strlen_done: add r2, r3, r4 cmp r2, lr - bhi __strcat_chk_failed + bhi .L_strcat_chk_failed // Set up the registers for the memcpy code. mov r1, r5 @@ -185,20 +185,17 @@ ENTRY(__strcat_chk) mov r2, r4 add r0, r0, r3 pop {r4, r5} -END(__strcat_chk) + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 -#define MEMCPY_BASE __strcat_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcat_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 + // Undo the above cfi directives. 
.cfi_adjust_cfa_offset 8 .cfi_rel_offset r4, 0 .cfi_rel_offset r5, 4 - +.L_strcat_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -208,7 +205,7 @@ error_code: .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcat_chk_failed) +END(__strcat_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S index db766863a..00202f3da 100644 --- a/libc/arch-arm/krait/bionic/__strcpy_chk.S +++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__strcpy_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 @@ -149,21 +149,14 @@ ENTRY(__strcpy_chk) pld [r1, #64] ldr r0, [sp] cmp r3, lr - bhs __strcpy_chk_failed + bhs .L_strcpy_chk_failed // Add 1 for copy length to get the string terminator. add r2, r3, #1 -END(__strcpy_chk) -#define MEMCPY_BASE __strcpy_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcpy_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - +.L_strcpy_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -173,7 +166,7 @@ error_code: .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcpy_chk_failed) +END(__strcpy_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index 9ff46a8ac..5d27b574f 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -45,7 +45,7 @@ ENTRY(__memcpy_chk) cmp r2, r3 - bhi __memcpy_chk_fail + bhi .L_memcpy_chk_fail // Fall through to memcpy... 
END(__memcpy_chk) @@ -53,19 +53,20 @@ END(__memcpy_chk) ENTRY(memcpy) pld [r1, #64] stmfd sp!, {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 -END(memcpy) -#define MEMCPY_BASE __memcpy_base -#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__memcpy_chk_fail) + // Undo the cfi directives from above. + .cfi_adjust_cfa_offset -8 + .cfi_restore r0 + .cfi_restore lr +.L_memcpy_chk_fail: // Preserve lr for backtrace. push {lr} - .cfi_def_cfa_offset 4 + .cfi_adjust_cfa_offset 4 .cfi_rel_offset lr, 0 ldr r0, error_message @@ -77,7 +78,7 @@ error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__memcpy_chk_fail) +END(memcpy) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S index 6c098aca1..76c5a8459 100644 --- a/libc/arch-arm/krait/bionic/memcpy_base.S +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -1,122 +1,191 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ +/*************************************************************************** + Copyright (c) 2009-2013 The Linux Foundation. All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of The Linux Foundation nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. -/* - * This code assumes it is running on a processor that supports all arm v7 - * instructions, that supports neon instructions, and that has a 32 byte - * cache line. - */ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ -// Assumes neon instructions and a cache line size of 32 bytes. +/* Assumes neon instructions and a cache line size of 64 bytes. */ -ENTRY_PRIVATE(MEMCPY_BASE) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 +#include <machine/cpu-features.h> +#include <machine/asm.h> - /* do we have at least 16-bytes to copy (needed for alignment below) */ - cmp r2, #16 - blo 5f +#define PLDOFFS (10) +#define PLDTHRESH (PLDOFFS) +#define BBTHRESH (4096/64) +#define PLDSIZE (64) - /* align destination to cache-line for the write-buffer */ - rsb r3, r0, #0 - ands r3, r3, #0xF - beq 2f +#if (PLDOFFS < 1) +#error Routine does not support offsets less than 1 +#endif - /* copy up to 15-bytes (count in r3) */ - sub r2, r2, r3 - movs ip, r3, lsl #31 - itt mi - ldrbmi lr, [r1], #1 - strbmi lr, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1], #1 - strbcs ip, [r0], #1 - strbcs lr, [r0], #1 - movs ip, r3, lsl #29 - bge 1f - // copies 4 bytes, destination 32-bits aligned - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! -1: bcc 2f - // copies 8 bytes, destination 64-bits aligned - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0, :64]!
+#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif -2: /* make sure we have at least 64 bytes to copy */ - subs r2, r2, #64 - blo 2f + .text + .fpu neon -1: /* The main loop copies 64 bytes at a time */ - vld1.8 {d0 - d3}, [r1]! - vld1.8 {d4 - d7}, [r1]! - pld [r1, #(32*8)] - subs r2, r2, #64 - vst1.8 {d0 - d3}, [r0, :128]! - vst1.8 {d4 - d7}, [r0, :128]! - bhs 1b +.L_memcpy_base: + cmp r2, #4 + blt .L_neon_lt4 + cmp r2, #16 + blt .L_neon_lt16 + cmp r2, #32 + blt .L_neon_16 + cmp r2, #64 + blt .L_neon_copy_32_a -2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ - adds r2, r2, #32 - blo 4f + mov r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .L_neon_copy_64_loop_nopld - /* Copy 32 bytes. These cache lines were already preloaded */ - vld1.8 {d0 - d3}, [r1]! - sub r2, r2, #32 - vst1.8 {d0 - d3}, [r0, :128]! + push {r9, r10} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r9, 0 + .cfi_rel_offset r10, 4 -4: /* less than 32 left */ - add r2, r2, #32 - tst r2, #0x10 - beq 5f - // copies 16 bytes, 128-bits aligned - vld1.8 {d0, d1}, [r1]! - vst1.8 {d0, d1}, [r0, :128]! + cmp r12, #BBTHRESH + ble .L_neon_prime_pump -5: /* copy up to 15-bytes (count in r2) */ - movs ip, r2, lsl #29 - bcc 1f - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0]! -1: bge 2f - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! 
-2: movs ip, r2, lsl #31 - itt mi - ldrbmi r3, [r1], #1 - strbmi r3, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1], #1 - strbcs ip, [r0], #1 - strbcs lr, [r0], #1 + add lr, r0, #0x400 + add r9, r1, #(PLDOFFS*PLDSIZE) + sub lr, lr, r9 + lsl lr, lr, #21 + lsr lr, lr, #21 + add lr, lr, #(PLDOFFS*PLDSIZE) + cmp r12, lr, lsr #6 + ble .L_neon_prime_pump - ldmfd sp!, {r0, pc} -END(MEMCPY_BASE) + itt gt + movgt r9, #(PLDOFFS) + rsbsgt r9, r9, lr, lsr #6 + ble .L_neon_prime_pump + + add r10, r1, lr + bic r10, #0x3F + + sub r12, r12, lr, lsr #6 + + cmp r9, r12 + itee le + suble r12, r12, r9 + movgt r9, r12 + movgt r12, #0 + + pld [r1, #((PLDOFFS-1)*PLDSIZE)] +.L_neon_copy_64_loop_outer_doublepld: + pld [r1, #((PLDOFFS)*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r9, r9, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .L_neon_copy_64_loop_outer_doublepld + cmp r12, #0 + beq .L_neon_pop_before_nopld + + cmp r12, #(512*1024/64) + blt .L_neon_copy_64_loop_outer + +.L_neon_copy_64_loop_ddr: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + pld [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .L_neon_copy_64_loop_ddr + b .L_neon_pop_before_nopld + +.L_neon_prime_pump: + mov lr, #(PLDOFFS*PLDSIZE) + add r10, r1, #(PLDOFFS*PLDSIZE) + bic r10, #0x3F + sub r12, r12, #PLDOFFS + ldr r3, [r10, #(-1*PLDSIZE)] + +.L_neon_copy_64_loop_outer: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .L_neon_copy_64_loop_outer + +.L_neon_pop_before_nopld: + mov r12, lr, lsr #6 + pop {r9, r10} + .cfi_adjust_cfa_offset -8 + .cfi_restore r9 + .cfi_restore r10 + +.L_neon_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! 
+ bne .L_neon_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .L_neon_exit + +.L_neon_copy_32_a: + movs r3, r2, lsl #27 + bcc .L_neon_16 + vld1.32 {q0,q1}, [r1]! + vst1.32 {q0,q1}, [r0]! + +.L_neon_16: + bpl .L_neon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! + ands r2, r2, #0x0f + beq .L_neon_exit + +.L_neon_lt16: + movs r3, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: + bge .L_neon_lt4 + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +.L_neon_lt4: + movs r2, r2, lsl #31 + itt cs + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + itt mi + ldrbmi r3, [r1] + strbmi r3, [r0] + +.L_neon_exit: + pop {r0, pc}