am 3230d96a: Merge "libc: krait: Use performance version of memcpy"

* commit '3230d96a10ab0cfa128487ef03fa5f08c77ebf20':
  libc: krait: Use performance version of memcpy
This commit is contained in:
Christopher Ferris 2015-09-30 23:44:12 +00:00 committed by Android Git Automerger
commit 8bb2141b77
4 changed files with 198 additions and 138 deletions

View File

@ -40,7 +40,7 @@
ENTRY(__strcat_chk)
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
bhi __strcat_chk_failed
bhi .L_strcat_chk_failed
// Set up the registers for the memcpy code.
mov r1, r5
@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
mov r2, r4
add r0, r0, r3
pop {r4, r5}
END(__strcat_chk)
.cfi_adjust_cfa_offset -8
.cfi_restore r4
.cfi_restore r5
#define MEMCPY_BASE __strcat_chk_memcpy_base
#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
#include "memcpy_base.S"
ENTRY_PRIVATE(__strcat_chk_failed)
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
// Undo the above cfi directives.
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
.L_strcat_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@ -208,7 +205,7 @@ error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
END(__strcat_chk_failed)
END(__strcat_chk)
.data
error_string:

View File

@ -39,7 +39,7 @@
ENTRY(__strcpy_chk)
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
bhs __strcpy_chk_failed
bhs .L_strcpy_chk_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
END(__strcpy_chk)
#define MEMCPY_BASE __strcpy_chk_memcpy_base
#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
#include "memcpy_base.S"
ENTRY_PRIVATE(__strcpy_chk_failed)
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
.L_strcpy_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@ -173,7 +166,7 @@ error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
END(__strcpy_chk_failed)
END(__strcpy_chk)
.data
error_string:

View File

@ -45,7 +45,7 @@
ENTRY(__memcpy_chk)
cmp r2, r3
bhi __memcpy_chk_fail
bhi .L_memcpy_chk_fail
// Fall through to memcpy...
END(__memcpy_chk)
@ -53,19 +53,20 @@ END(__memcpy_chk)
ENTRY(memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
.cfi_def_cfa_offset 8
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
END(memcpy)
#define MEMCPY_BASE __memcpy_base
#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
#include "memcpy_base.S"
ENTRY_PRIVATE(__memcpy_chk_fail)
// Undo the cfi directives from above.
.cfi_adjust_cfa_offset -8
.cfi_restore r0
.cfi_restore lr
.L_memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
.cfi_def_cfa_offset 4
.cfi_adjust_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
@ -77,7 +78,7 @@ error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
END(__memcpy_chk_fail)
END(memcpy)
.data
error_string:

View File

@ -1,122 +1,191 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/***************************************************************************
Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of The Linux Foundation nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
/*
* This code assumes it is running on a processor that supports all arm v7
* instructions, that supports neon instructions, and that has a 32 byte
* cache line.
*/
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
// Assumes neon instructions and a cache line size of 32 bytes.
/* Assumes neon instructions and a cache line size of 64 bytes. */
ENTRY_PRIVATE(MEMCPY_BASE)
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
#include <machine/cpu-features.h>
#include <machine/asm.h>
/* do we have at least 16-bytes to copy (needed for alignment below) */
#define PLDOFFS (10)
#define PLDTHRESH (PLDOFFS)
#define BBTHRESH (4096/64)
#define PLDSIZE (64)
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
.text
.fpu neon
.L_memcpy_base:
cmp r2, #4
blt .L_neon_lt4
cmp r2, #16
blo 5f
blt .L_neon_lt16
cmp r2, #32
blt .L_neon_16
cmp r2, #64
blt .L_neon_copy_32_a
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
mov r12, r2, lsr #6
cmp r12, #PLDTHRESH
ble .L_neon_copy_64_loop_nopld
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
itt mi
ldrbmi lr, [r1], #1
strbmi lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
push {r9, r10}
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r9, 0
.cfi_rel_offset r10, 4
2: /* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
cmp r12, #BBTHRESH
ble .L_neon_prime_pump
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(32*8)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
add lr, r0, #0x400
add r9, r1, #(PLDOFFS*PLDSIZE)
sub lr, lr, r9
lsl lr, lr, #21
lsr lr, lr, #21
add lr, lr, #(PLDOFFS*PLDSIZE)
cmp r12, lr, lsr #6
ble .L_neon_prime_pump
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
adds r2, r2, #32
blo 4f
itt gt
movgt r9, #(PLDOFFS)
rsbsgt r9, r9, lr, lsr #6
ble .L_neon_prime_pump
/* Copy 32 bytes. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
sub r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
add r10, r1, lr
bic r10, #0x3F
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
sub r12, r12, lr, lsr #6
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
cmp r9, r12
itee le
suble r12, r12, r9
movgt r9, r12
movgt r12, #0
pld [r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
pld [r1, #((PLDOFFS)*PLDSIZE)]
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
ldr r3, [r10]
subs r9, r9, #1
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
bne .L_neon_copy_64_loop_outer_doublepld
cmp r12, #0
beq .L_neon_pop_before_nopld
cmp r12, #(512*1024/64)
blt .L_neon_copy_64_loop_outer
.L_neon_copy_64_loop_ddr:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
pld [r10]
subs r12, r12, #1
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
bne .L_neon_copy_64_loop_ddr
b .L_neon_pop_before_nopld
.L_neon_prime_pump:
mov lr, #(PLDOFFS*PLDSIZE)
add r10, r1, #(PLDOFFS*PLDSIZE)
bic r10, #0x3F
sub r12, r12, #PLDOFFS
ldr r3, [r10, #(-1*PLDSIZE)]
.L_neon_copy_64_loop_outer:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
ldr r3, [r10]
subs r12, r12, #1
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
bne .L_neon_copy_64_loop_outer
.L_neon_pop_before_nopld:
mov r12, lr, lsr #6
pop {r9, r10}
.cfi_adjust_cfa_offset -8
.cfi_restore r9
.cfi_restore r10
.L_neon_copy_64_loop_nopld:
vld1.32 {q8, q9}, [r1]!
vld1.32 {q10, q11}, [r1]!
subs r12, r12, #1
vst1.32 {q8, q9}, [r0]!
vst1.32 {q10, q11}, [r0]!
bne .L_neon_copy_64_loop_nopld
ands r2, r2, #0x3f
beq .L_neon_exit
.L_neon_copy_32_a:
movs r3, r2, lsl #27
bcc .L_neon_16
vld1.32 {q0,q1}, [r1]!
vst1.32 {q0,q1}, [r0]!
.L_neon_16:
bpl .L_neon_lt16
vld1.32 {q8}, [r1]!
vst1.32 {q8}, [r0]!
ands r2, r2, #0x0f
beq .L_neon_exit
.L_neon_lt16:
movs r3, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
1:
bge .L_neon_lt4
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
itt mi
ldrbmi r3, [r1], #1
strbmi r3, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
ldmfd sp!, {r0, pc}
END(MEMCPY_BASE)
.L_neon_lt4:
movs r2, r2, lsl #31
itt cs
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
itt mi
ldrbmi r3, [r1]
strbmi r3, [r0]
.L_neon_exit:
pop {r0, pc}