Make memcpy memmove
Bug: http://b/63992911
Test: Change BoardConfig.mk and compile for each variant
Change-Id: Ia0cc68d8e90e3316ddb2e9ff1555a009b6a0c5be
parent 42596b7bf0
commit 8a0f0ed5e7
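The semantic basis for this change: ISO C requires memmove to handle overlapping source and destination, while memcpy does not, so every well-defined memcpy call is also a well-defined memmove call. A minimal C sketch of that equivalence (illustrative only, not bionic's implementation; the name sketch_memcpy is made up):

    #include <string.h>

    /* Illustrative only: giving memcpy memmove's behavior is always
     * correct, because memmove's contract subsumes memcpy's -- it
     * merely also tolerates overlapping buffers, and still returns
     * the destination pointer. */
    void *sketch_memcpy(void *dst, const void *src, size_t n) {
        return memmove(dst, src, n);
    }

In the diff below this is done at the symbol level: each arch-specific memcpy entry point is renamed __memcpy, memmove branches to __memcpy for the non-overlapping cases, and ALIAS_SYMBOL(memcpy, memmove) exports memcpy as another name for memmove.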
@@ -855,7 +855,6 @@ cc_library_static {
         arm: {
             srcs: [
                 "arch-arm/generic/bionic/memcmp.S",
-                "arch-arm/generic/bionic/memcpy.S",
                 "arch-arm/generic/bionic/memmove.S",
                 "arch-arm/generic/bionic/memset.S",
                 "arch-arm/generic/bionic/strcmp.S",
@@ -1125,7 +1124,6 @@ cc_library_static {
                 "arch-x86/atom/string/sse2-wcsrchr-atom.S",
                 "arch-x86/atom/string/sse2-wcslen-atom.S",
                 "arch-x86/atom/string/sse2-wcscmp-atom.S",
-                "arch-x86/silvermont/string/sse2-memcpy-slm.S",
                 "arch-x86/silvermont/string/sse2-memmove-slm.S",
                 "arch-x86/silvermont/string/sse2-memset-slm.S",
                 "arch-x86/silvermont/string/sse2-stpcpy-slm.S",
@@ -1154,14 +1152,12 @@ cc_library_static {
                 "arch-x86/atom/string/sse2-strlen-atom.S",
                 "arch-x86/atom/string/ssse3-memcmp-atom.S",
                 "arch-x86/atom/string/ssse3-memcpy-atom.S",
-                "arch-x86/atom/string/ssse3-memmove-atom.S",
                 "arch-x86/atom/string/ssse3-strcpy-atom.S",
                 "arch-x86/atom/string/ssse3-strncpy-atom.S",
                 "arch-x86/atom/string/ssse3-wmemcmp-atom.S",
             ],
             exclude_srcs: [
                 "arch-x86/generic/string/memcmp.S",
-                "arch-x86/silvermont/string/sse2-memcpy-slm.S",
                 "arch-x86/silvermont/string/sse2-memmove-slm.S",
                 "arch-x86/silvermont/string/sse2-memset-slm.S",
                 "arch-x86/silvermont/string/sse2-strcpy-slm.S",
@@ -1198,7 +1194,6 @@ cc_library_static {
         },
         x86_64: {
             srcs: [
-                "arch-x86_64/string/sse2-memcpy-slm.S",
                 "arch-x86_64/string/sse2-memmove-slm.S",
                 "arch-x86_64/string/sse2-memset-slm.S",
                 "arch-x86_64/string/sse2-stpcpy-slm.S",

@@ -64,7 +64,7 @@
 .arch armv7-a

 // Prototype: void *memcpy (void *dst, const void *src, size_t count).
-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #64]
         push    {r0, lr}
         .cfi_def_cfa_offset 8
@@ -72,4 +72,4 @@ ENTRY(memcpy)
         .cfi_rel_offset lr, 4

 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -64,7 +64,7 @@
 .arch armv7-a

 // Prototype: void *memcpy (void *dst, const void *src, size_t count).
-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #64]
         push    {r0, lr}
         .cfi_def_cfa_offset 8
@@ -72,4 +72,4 @@ ENTRY(memcpy)
         .cfi_rel_offset lr, 4

 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -64,7 +64,7 @@
 .arch armv7-a

 // Prototype: void *memcpy (void *dst, const void *src, size_t count).
-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #64]
         push    {r0, lr}
         .cfi_def_cfa_offset 8
@@ -72,4 +72,4 @@ ENTRY(memcpy)
         .cfi_rel_offset lr, 4

 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -39,14 +39,14 @@
 .thumb
 .thumb_func

-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #0]
         stmfd   sp!, {r0, lr}
         .cfi_def_cfa_offset 8
         .cfi_rel_offset r0, 0
         .cfi_rel_offset lr, 4
         pld     [r1, #64]
-END(memcpy)
+END(__memcpy)

 #define MEMCPY_BASE         __memcpy_base
 #define MEMCPY_BASE_ALIGNED __memcpy_base_aligned

@@ -65,13 +65,13 @@
 // arch. The code generated is exactly the same.
 .arch armv7-a

-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #64]
         push    {r0, lr}
         .cfi_def_cfa_offset 8
         .cfi_rel_offset r0, 0
         .cfi_rel_offset lr, 4
-END(memcpy)
+END(__memcpy)

 #define MEMCPY_BASE         __memcpy_base
 #define MEMCPY_BASE_ALIGNED __memcpy_base_aligned

@@ -50,7 +50,7 @@ ENTRY(memmove)
         bhi     .L_reversed_memcpy

 .L_jump_to_memcpy:
-        b       memcpy
+        b       __memcpy

 .L_reversed_memcpy:
         push    {r0, lr}
@@ -278,3 +278,5 @@ ENTRY(memmove)
         pop     {r0, pc}

 END(memmove)
+
+ALIAS_SYMBOL(memcpy, memmove)

@@ -1,379 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <private/bionic_asm.h>
-
-/*
- * Optimized memcpy() for ARM.
- *
- * note that memcpy() always returns the destination pointer,
- * so we have to preserve R0.
- */
-
-.syntax unified
-
-ENTRY(memcpy)
-        /* The stack must always be 64-bits aligned to be compliant with the
-         * ARM ABI. Since we have to save R0, we might as well save R4
-         * which we can use for better pipelining of the reads below
-         */
-        stmfd   sp!, {r0, r4, lr}
-        .cfi_def_cfa_offset 12
-        .cfi_rel_offset r0, 0
-        .cfi_rel_offset r4, 4
-        .cfi_rel_offset lr, 8
-        /* Making room for r5-r11 which will be spilled later */
-        sub     sp, sp, #28
-        .cfi_adjust_cfa_offset 28
-
-        // preload the destination because we'll align it to a cache line
-        // with small writes. Also start the source "pump".
-        pld     [r0, #0]
-        pld     [r1, #0]
-        pld     [r1, #32]
-
-        /* it simplifies things to take care of len<4 early */
-        cmp     r2, #4
-        blo     .Lcopy_last_3_and_return
-
-        /* compute the offset to align the source
-         * offset = (4-(src&3))&3 = -src & 3
-         */
-        rsb     r3, r1, #0
-        ands    r3, r3, #3
-        beq     .Lsrc_aligned
-
-        /* align source to 32 bits. We need to insert 2 instructions between
-         * a ldr[b|h] and str[b|h] because byte and half-word instructions
-         * stall 2 cycles.
-         */
-        movs    r12, r3, lsl #31
-        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
-        ldrbmi  r3, [r1], #1
-        ldrbcs  r4, [r1], #1
-        ldrbcs  r12,[r1], #1
-        strbmi  r3, [r0], #1
-        strbcs  r4, [r0], #1
-        strbcs  r12,[r0], #1
-
-.Lsrc_aligned:
-
-        /* see if src and dst are aligned together (congruent) */
-        eor     r12, r0, r1
-        tst     r12, #3
-        bne     .Lnon_congruent
-
-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
-         * frame. Don't update sp.
-         */
-        stmea   sp, {r5-r11}
-
-        /* align the destination to a cache-line */
-        rsb     r3, r0, #0
-        ands    r3, r3, #0x1C
-        beq     .Lcongruent_aligned32
-        cmp     r3, r2
-        andhi   r3, r2, #0x1C
-
-        /* conditionally copies 0 to 7 words (length in r3) */
-        movs    r12, r3, lsl #28
-        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
-        ldmmi   r1!, {r8, r9}           /*  8 bytes */
-        stmcs   r0!, {r4, r5, r6, r7}
-        stmmi   r0!, {r8, r9}
-        tst     r3, #0x4
-        ldrne   r10,[r1], #4            /*  4 bytes */
-        strne   r10,[r0], #4
-        sub     r2, r2, r3
-
-.Lcongruent_aligned32:
-        /*
-         * here source is aligned to 32 bytes.
-         */
-
-.Lcached_aligned32:
-        subs    r2, r2, #32
-        blo     .Lless_than_32_left
-
-        /*
-         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
-         * stall only until the requested world is fetched, but the linefill
-         * continues in the the background.
-         * While the linefill is going, we write our previous cache-line
-         * into the write-buffer (which should have some free space).
-         * When the linefill is done, the writebuffer will
-         * start dumping its content into memory
-         *
-         * While all this is going, we then load a full cache line into
-         * 8 registers, this cache line should be in the cache by now
-         * (or partly in the cache).
-         *
-         * This code should work well regardless of the source/dest alignment.
-         *
-         */
-
-        // Align the preload register to a cache-line because the cpu does
-        // "critical word first" (the first word requested is loaded first).
-        bic     r12, r1, #0x1F
-        add     r12, r12, #64
-
-1:      ldmia   r1!, { r4-r11 }
-        pld     [r12, #64]
-        subs    r2, r2, #32
-
-        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
-        // for ARM9 preload will not be safely guarded by the preceding subs.
-        // When it is safely guarded the only possibility to have SIGSEGV here
-        // is because the caller overstates the length.
-        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
-        stmia   r0!, { r4-r11 }
-        bhs     1b
-
-        add     r2, r2, #32
-
-.Lless_than_32_left:
-        /*
-         * less than 32 bytes left at this point (length in r2)
-         */
-
-        /* skip all this if there is nothing to do, which should
-         * be a common case (if not executed the code below takes
-         * about 16 cycles)
-         */
-        tst     r2, #0x1F
-        beq     1f
-
-        /* conditionnaly copies 0 to 31 bytes */
-        movs    r12, r2, lsl #28
-        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
-        ldmmi   r1!, {r8, r9}           /*  8 bytes */
-        stmcs   r0!, {r4, r5, r6, r7}
-        stmmi   r0!, {r8, r9}
-        movs    r12, r2, lsl #30
-        ldrcs   r3, [r1], #4            /*  4 bytes */
-        ldrhmi  r4, [r1], #2            /*  2 bytes */
-        strcs   r3, [r0], #4
-        strhmi  r4, [r0], #2
-        tst     r2, #0x1
-        ldrbne  r3, [r1]                /*  last byte  */
-        strbne  r3, [r0]
-
-        /* we're done! restore everything and return */
-1:      ldmfd   sp!, {r5-r11}
-        ldmfd   sp!, {r0, r4, pc}
-
-        /********************************************************************/
-
-.Lnon_congruent:
-        /*
-         * here source is aligned to 4 bytes
-         * but destination is not.
-         *
-         * in the code below r2 is the number of bytes read
-         * (the number of bytes written is always smaller, because we have
-         * partial words in the shift queue)
-         */
-        cmp     r2, #4
-        blo     .Lcopy_last_3_and_return
-
-        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
-         * frame. Don't update sp.
-         */
-        stmea   sp, {r5-r11}
-
-        /* compute shifts needed to align src to dest */
-        rsb     r5, r0, #0
-        and     r5, r5, #3              /* r5 = # bytes in partial words */
-        mov     r12, r5, lsl #3         /* r12 = right */
-        rsb     lr, r12, #32            /* lr = left */
-
-        /* read the first word */
-        ldr     r3, [r1], #4
-        sub     r2, r2, #4
-
-        /* write a partial word (0 to 3 bytes), such that destination
-         * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
-         */
-        movs    r5, r5, lsl #31
-        strbmi  r3, [r0], #1
-        movmi   r3, r3, lsr #8
-        strbcs  r3, [r0], #1
-        movcs   r3, r3, lsr #8
-        strbcs  r3, [r0], #1
-        movcs   r3, r3, lsr #8
-
-        cmp     r2, #4
-        blo     .Lpartial_word_tail
-
-        /* Align destination to 32 bytes (cache line boundary) */
-1:      tst     r0, #0x1c
-        beq     2f
-        ldr     r5, [r1], #4
-        sub     r2, r2, #4
-        orr     r4, r3, r5, lsl lr
-        mov     r3, r5, lsr r12
-        str     r4, [r0], #4
-        cmp     r2, #4
-        bhs     1b
-        blo     .Lpartial_word_tail
-
-        /* copy 32 bytes at a time */
-2:      subs    r2, r2, #32
-        blo     .Lless_than_thirtytwo
-
-        /* Use immediate mode for the shifts, because there is an extra cycle
-         * for register shifts, which could account for up to 50% of
-         * performance hit.
-         */
-
-        cmp     r12, #24
-        beq     .Lloop24
-        cmp     r12, #8
-        beq     .Lloop8
-
-.Lloop16:
-        ldr     r12, [r1], #4
-1:      mov     r4, r12
-        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
-        pld     [r1, #64]
-        subs    r2, r2, #32
-        ldrhs   r12, [r1], #4
-        orr     r3, r3, r4, lsl #16
-        mov     r4, r4, lsr #16
-        orr     r4, r4, r5, lsl #16
-        mov     r5, r5, lsr #16
-        orr     r5, r5, r6, lsl #16
-        mov     r6, r6, lsr #16
-        orr     r6, r6, r7, lsl #16
-        mov     r7, r7, lsr #16
-        orr     r7, r7, r8, lsl #16
-        mov     r8, r8, lsr #16
-        orr     r8, r8, r9, lsl #16
-        mov     r9, r9, lsr #16
-        orr     r9, r9, r10, lsl #16
-        mov     r10, r10, lsr #16
-        orr     r10, r10, r11, lsl #16
-        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-        mov     r3, r11, lsr #16
-        bhs     1b
-        b       .Lless_than_thirtytwo
-
-.Lloop8:
-        ldr     r12, [r1], #4
-1:      mov     r4, r12
-        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
-        pld     [r1, #64]
-        subs    r2, r2, #32
-        ldrhs   r12, [r1], #4
-        orr     r3, r3, r4, lsl #24
-        mov     r4, r4, lsr #8
-        orr     r4, r4, r5, lsl #24
-        mov     r5, r5, lsr #8
-        orr     r5, r5, r6, lsl #24
-        mov     r6, r6, lsr #8
-        orr     r6, r6, r7, lsl #24
-        mov     r7, r7, lsr #8
-        orr     r7, r7, r8, lsl #24
-        mov     r8, r8, lsr #8
-        orr     r8, r8, r9, lsl #24
-        mov     r9, r9, lsr #8
-        orr     r9, r9, r10, lsl #24
-        mov     r10, r10, lsr #8
-        orr     r10, r10, r11, lsl #24
-        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-        mov     r3, r11, lsr #8
-        bhs     1b
-        b       .Lless_than_thirtytwo
-
-.Lloop24:
-        ldr     r12, [r1], #4
-1:      mov     r4, r12
-        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
-        pld     [r1, #64]
-        subs    r2, r2, #32
-        ldrhs   r12, [r1], #4
-        orr     r3, r3, r4, lsl #8
-        mov     r4, r4, lsr #24
-        orr     r4, r4, r5, lsl #8
-        mov     r5, r5, lsr #24
-        orr     r5, r5, r6, lsl #8
-        mov     r6, r6, lsr #24
-        orr     r6, r6, r7, lsl #8
-        mov     r7, r7, lsr #24
-        orr     r7, r7, r8, lsl #8
-        mov     r8, r8, lsr #24
-        orr     r8, r8, r9, lsl #8
-        mov     r9, r9, lsr #24
-        orr     r9, r9, r10, lsl #8
-        mov     r10, r10, lsr #24
-        orr     r10, r10, r11, lsl #8
-        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-        mov     r3, r11, lsr #24
-        bhs     1b
-
-
-.Lless_than_thirtytwo:
-        /* copy the last 0 to 31 bytes of the source */
-        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
-        add     r2, r2, #32
-        cmp     r2, #4
-        blo     .Lpartial_word_tail
-
-1:      ldr     r5, [r1], #4
-        sub     r2, r2, #4
-        orr     r4, r3, r5, lsl lr
-        mov     r3, r5, lsr r12
-        str     r4, [r0], #4
-        cmp     r2, #4
-        bhs     1b
-
-.Lpartial_word_tail:
-        /* we have a partial word in the input buffer */
-        movs    r5, lr, lsl #(31-3)
-        strbmi  r3, [r0], #1
-        movmi   r3, r3, lsr #8
-        strbcs  r3, [r0], #1
-        movcs   r3, r3, lsr #8
-        strbcs  r3, [r0], #1
-
-        /* Refill spilled registers from the stack. Don't update sp. */
-        ldmfd   sp, {r5-r11}
-
-.Lcopy_last_3_and_return:
-        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
-        ldrbmi  r2, [r1], #1
-        ldrbcs  r3, [r1], #1
-        ldrbcs  r12,[r1]
-        strbmi  r2, [r0], #1
-        strbcs  r3, [r0], #1
-        strbcs  r12,[r0]
-
-        /* we're done! restore sp and spilled registers and return */
-        add     sp, sp, #28
-        ldmfd   sp!, {r0, r4, pc}
-END(memcpy)

@@ -469,3 +469,5 @@ ENTRY(memmove)
         bl      bsd_safe_memcpy
         ldmfd   sp!, {r0, pc}
 END(memmove)
+
+ALIAS_SYMBOL(memcpy, memmove)

@@ -42,7 +42,7 @@
 .thumb
 .thumb_func

-ENTRY(memcpy)
+ENTRY(__memcpy)
         pld     [r1, #64]
         stmfd   sp!, {r0, lr}
         .cfi_adjust_cfa_offset 8
@@ -50,4 +50,4 @@ ENTRY(memcpy)
         .cfi_rel_offset lr, 4

 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -34,7 +34,7 @@
 #define PLDSIZE (128) /* L2 cache line size */

 .code 32
-ENTRY(memcpy)
+ENTRY(__memcpy)
         push    {r0}
         .cfi_def_cfa_offset 4
         .cfi_rel_offset r0, 0
@@ -123,4 +123,4 @@ ENTRY(memcpy)
         pop     {r0}
         bx      lr

-END(memcpy)
+END(__memcpy)

@@ -30,6 +30,6 @@

 #include <private/bionic_asm.h>

-ENTRY(memcpy)
+ENTRY(__memcpy)
 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -65,7 +65,7 @@ ENTRY(memmove)
         b.lo    .Ldownwards
         add     tmp1, src, count
         cmp     dstin, tmp1
-        b.hs    memcpy          /* No overlap.  */
+        b.hs    __memcpy        /* No overlap.  */

         /* Upwards move with potential overlap.
          * Need to move from the tail backwards.  SRC and DST point one
@@ -196,7 +196,7 @@ ENTRY(memmove)
          * DST is more than 16 bytes away from SRC.  */
         sub     tmp1, src, #16
         cmp     dstin, tmp1
-        b.ls    memcpy          /* May overlap, but not critically.  */
+        b.ls    __memcpy        /* May overlap, but not critically.  */

         mov     dst, dstin      /* Preserve DSTIN for return value.  */
         cmp     count, #64
@@ -326,4 +326,6 @@ ENTRY(memmove)
 END(wmemmove)
 #else
 END(memmove)
+
+ALIAS_SYMBOL(memcpy, memmove)
 #endif

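The `b.hs __memcpy` and `b.ls __memcpy` hunks above preserve memmove's overlap dispatch: when the destination does not fall inside the live part of the source range, the straight forward copy is safe. A rough C rendering of that control flow (a sketch under the same semantics, not bionic's code; the name sketch_memmove is made up):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch: if dst is outside [src, src+count), a plain forward copy
     * (what __memcpy does) is safe; otherwise copy from the tail
     * backwards so bytes are read before they are overwritten. */
    static void *sketch_memmove(void *dst, const void *src, size_t count) {
        uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;
        unsigned char *dp = dst;
        const unsigned char *sp = src;
        if (d < s || d >= s + count) {      /* cmp dstin, tmp1; b.hs __memcpy */
            for (size_t i = 0; i < count; i++) dp[i] = sp[i];
        } else {                            /* overlapping: go backwards */
            for (size_t i = count; i-- > 0; ) dp[i] = sp[i];
        }
        return dst;
    }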
@@ -30,6 +30,6 @@

 #include <private/bionic_asm.h>

-ENTRY(memcpy)
+ENTRY(__memcpy)
 #include "memcpy_base.S"
-END(memcpy)
+END(__memcpy)

@@ -92,7 +92,7 @@ ENTRY(memmove)
         sub     tmp1, dstin, src
         cmp     count, 96
         ccmp    tmp1, count, 2, hi
-        b.hs    memcpy
+        b.hs    __memcpy

         cbz     tmp1, 3f
         add     dstend, dstin, count
@@ -150,4 +150,6 @@ ENTRY(memmove)
 END(wmemmove)
 #else
 END(memmove)
+
+ALIAS_SYMBOL(memcpy, memmove)
 #endif

@@ -34,6 +34,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # define MEMCPY memcpy
 #endif

+#ifndef USE_AS_MEMMOVE
+# define USE_AS_MEMMOVE
+#endif
+
 #ifndef L
 # define L(label) .L##label
 #endif
@@ -67,6 +71,12 @@ name: \
 cfi_startproc
 #endif

+#ifndef ALIAS_SYMBOL
+# define ALIAS_SYMBOL(alias, original) \
+    .globl alias; \
+    .equ alias, original
+#endif
+
 #ifndef END
 # define END(name) \
 cfi_endproc; \
@@ -3122,3 +3132,5 @@ L(bk_ssse3_cpy):
 #endif

 END (MEMCPY)
+
+ALIAS_SYMBOL(memmove, MEMCPY)

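The ALIAS_SYMBOL fallback added above is just `.globl alias; .equ alias, original`: the alias becomes a second exported name for the very same code, with no thunk in between. A hypothetical C analogue using the GNU alias attribute (the names impl and copy are illustrative, not from bionic):

    #include <stddef.h>

    /* Hypothetical illustration of symbol aliasing: "copy" is exported
     * as a second name bound to "impl"'s definition, much as
     * ALIAS_SYMBOL binds one assembler symbol to another. */
    void *impl(void *dst, const void *src, size_t n) {
        unsigned char *d = dst;
        const unsigned char *s = src;
        while (n--) *d++ = *s++;
        return dst;
    }

    extern __typeof(impl) copy __attribute__((alias("impl")));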
@@ -1,34 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-  * Redistributions of source code must retain the above copyright notice,
-  * this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above copyright notice,
-  * this list of conditions and the following disclaimer in the documentation
-  * and/or other materials provided with the distribution.
-
-  * Neither the name of Intel Corporation nor the names of its contributors
-  * may be used to endorse or promote products derived from this software
-  * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#define MEMCPY memmove
-#define USE_AS_MEMMOVE
-#include "ssse3-memcpy-atom.S"

@@ -1,308 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-  * Redistributions of source code must retain the above copyright notice,
-  * this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above copyright notice,
-  * this list of conditions and the following disclaimer in the documentation
-  * and/or other materials provided with the distribution.
-
-  * Neither the name of Intel Corporation nor the names of its contributors
-  * may be used to endorse or promote products derived from this software
-  * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include "cache.h"
-
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#ifndef L
-# define L(label) .L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
-#endif
-
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
-#endif
-
-#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
-#endif
-
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name) \
-        .type name, @function; \
-        .globl name; \
-        .p2align 4; \
-name: \
-        cfi_startproc
-#endif
-
-#ifndef END
-# define END(name) \
-        cfi_endproc; \
-        .size name, .-name
-#endif
-
-#define DEST PARMS
-#define SRC DEST+4
-#define LEN SRC+4
-
-#define CFI_PUSH(REG) \
-        cfi_adjust_cfa_offset (4); \
-        cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG) \
-        cfi_adjust_cfa_offset (-4); \
-        cfi_restore (REG)
-
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
-
-#define PARMS 8  /* Preserve EBX. */
-#define ENTRANCE PUSH (%ebx);
-#define RETURN_END POP (%ebx); ret
-#define RETURN RETURN_END; CFI_PUSH (%ebx)
-
-.section .text.sse2,"ax",@progbits
-ENTRY (MEMCPY)
-        ENTRANCE
-        movl    LEN(%esp), %ecx
-        movl    SRC(%esp), %eax
-        movl    DEST(%esp), %edx
-
-        cmp     %eax, %edx
-        je      L(return)
-
-        cmp     $16, %ecx
-        jbe     L(len_0_16_bytes)
-
-        cmp     $SHARED_CACHE_SIZE_HALF, %ecx
-        jae     L(large_page)
-
-        movdqu  (%eax), %xmm0
-        movdqu  -16(%eax, %ecx), %xmm1
-        cmpl    $32, %ecx
-        movdqu  %xmm0, (%edx)
-        movdqu  %xmm1, -16(%edx, %ecx)
-        jbe     L(return)
-
-        movdqu  16(%eax), %xmm0
-        movdqu  -32(%eax, %ecx), %xmm1
-        cmpl    $64, %ecx
-        movdqu  %xmm0, 16(%edx)
-        movdqu  %xmm1, -32(%edx, %ecx)
-        jbe     L(return)
-
-        movdqu  32(%eax), %xmm0
-        movdqu  48(%eax), %xmm1
-        movdqu  -48(%eax, %ecx), %xmm2
-        movdqu  -64(%eax, %ecx), %xmm3
-        cmpl    $128, %ecx
-        movdqu  %xmm0, 32(%edx)
-        movdqu  %xmm1, 48(%edx)
-        movdqu  %xmm2, -48(%edx, %ecx)
-        movdqu  %xmm3, -64(%edx, %ecx)
-        jbe     L(return)
-
-/* Now the main loop: we align the address of the destination. */
-        leal    64(%edx), %ebx
-        andl    $-64, %ebx
-
-        addl    %edx, %ecx
-        andl    $-64, %ecx
-
-        subl    %edx, %eax
-
-/* We should stop two iterations before the termination
-        (in order not to misprefetch). */
-        subl    $64, %ecx
-        cmpl    %ebx, %ecx
-        je      L(main_loop_just_one_iteration)
-
-        subl    $64, %ecx
-        cmpl    %ebx, %ecx
-        je      L(main_loop_last_two_iterations)
-
-
-        .p2align 4
-L(main_loop_cache):
-
-        prefetcht0 128(%ebx, %eax)
-
-        movdqu  (%ebx, %eax), %xmm0
-        movdqu  16(%ebx, %eax), %xmm1
-        movdqu  32(%ebx, %eax), %xmm2
-        movdqu  48(%ebx, %eax), %xmm3
-        movdqa  %xmm0, (%ebx)
-        movdqa  %xmm1, 16(%ebx)
-        movdqa  %xmm2, 32(%ebx)
-        movdqa  %xmm3, 48(%ebx)
-        lea     64(%ebx), %ebx
-        cmpl    %ebx, %ecx
-        jne     L(main_loop_cache)
-
-L(main_loop_last_two_iterations):
-        movdqu  (%ebx, %eax), %xmm0
-        movdqu  16(%ebx, %eax), %xmm1
-        movdqu  32(%ebx, %eax), %xmm2
-        movdqu  48(%ebx, %eax), %xmm3
-        movdqu  64(%ebx, %eax), %xmm4
-        movdqu  80(%ebx, %eax), %xmm5
-        movdqu  96(%ebx, %eax), %xmm6
-        movdqu  112(%ebx, %eax), %xmm7
-        movdqa  %xmm0, (%ebx)
-        movdqa  %xmm1, 16(%ebx)
-        movdqa  %xmm2, 32(%ebx)
-        movdqa  %xmm3, 48(%ebx)
-        movdqa  %xmm4, 64(%ebx)
-        movdqa  %xmm5, 80(%ebx)
-        movdqa  %xmm6, 96(%ebx)
-        movdqa  %xmm7, 112(%ebx)
-        jmp     L(return)
-
-L(main_loop_just_one_iteration):
-        movdqu  (%ebx, %eax), %xmm0
-        movdqu  16(%ebx, %eax), %xmm1
-        movdqu  32(%ebx, %eax), %xmm2
-        movdqu  48(%ebx, %eax), %xmm3
-        movdqa  %xmm0, (%ebx)
-        movdqa  %xmm1, 16(%ebx)
-        movdqa  %xmm2, 32(%ebx)
-        movdqa  %xmm3, 48(%ebx)
-        jmp     L(return)
-
-L(large_page):
-        movdqu  (%eax), %xmm0
-        movdqu  16(%eax), %xmm1
-        movdqu  32(%eax), %xmm2
-        movdqu  48(%eax), %xmm3
-        movdqu  -64(%eax, %ecx), %xmm4
-        movdqu  -48(%eax, %ecx), %xmm5
-        movdqu  -32(%eax, %ecx), %xmm6
-        movdqu  -16(%eax, %ecx), %xmm7
-        movdqu  %xmm0, (%edx)
-        movdqu  %xmm1, 16(%edx)
-        movdqu  %xmm2, 32(%edx)
-        movdqu  %xmm3, 48(%edx)
-        movdqu  %xmm4, -64(%edx, %ecx)
-        movdqu  %xmm5, -48(%edx, %ecx)
-        movdqu  %xmm6, -32(%edx, %ecx)
-        movdqu  %xmm7, -16(%edx, %ecx)
-
-        movdqu  64(%eax), %xmm0
-        movdqu  80(%eax), %xmm1
-        movdqu  96(%eax), %xmm2
-        movdqu  112(%eax), %xmm3
-        movdqu  -128(%eax, %ecx), %xmm4
-        movdqu  -112(%eax, %ecx), %xmm5
-        movdqu  -96(%eax, %ecx), %xmm6
-        movdqu  -80(%eax, %ecx), %xmm7
-        movdqu  %xmm0, 64(%edx)
-        movdqu  %xmm1, 80(%edx)
-        movdqu  %xmm2, 96(%edx)
-        movdqu  %xmm3, 112(%edx)
-        movdqu  %xmm4, -128(%edx, %ecx)
-        movdqu  %xmm5, -112(%edx, %ecx)
-        movdqu  %xmm6, -96(%edx, %ecx)
-        movdqu  %xmm7, -80(%edx, %ecx)
-
-/* Now the main loop with non temporal stores. We align
-        the address of the destination. */
-        leal    128(%edx), %ebx
-        andl    $-128, %ebx
-
-        addl    %edx, %ecx
-        andl    $-128, %ecx
-
-        subl    %edx, %eax
-
-        .p2align 4
-L(main_loop_large_page):
-        movdqu  (%ebx, %eax), %xmm0
-        movdqu  16(%ebx, %eax), %xmm1
-        movdqu  32(%ebx, %eax), %xmm2
-        movdqu  48(%ebx, %eax), %xmm3
-        movdqu  64(%ebx, %eax), %xmm4
-        movdqu  80(%ebx, %eax), %xmm5
-        movdqu  96(%ebx, %eax), %xmm6
-        movdqu  112(%ebx, %eax), %xmm7
-        movntdq %xmm0, (%ebx)
-        movntdq %xmm1, 16(%ebx)
-        movntdq %xmm2, 32(%ebx)
-        movntdq %xmm3, 48(%ebx)
-        movntdq %xmm4, 64(%ebx)
-        movntdq %xmm5, 80(%ebx)
-        movntdq %xmm6, 96(%ebx)
-        movntdq %xmm7, 112(%ebx)
-        lea     128(%ebx), %ebx
-        cmpl    %ebx, %ecx
-        jne     L(main_loop_large_page)
-        sfence
-        jmp     L(return)
-
-L(len_0_16_bytes):
-        testb   $24, %cl
-        jne     L(len_9_16_bytes)
-        testb   $4, %cl
-        .p2align 4,,5
-        jne     L(len_5_8_bytes)
-        testl   %ecx, %ecx
-        .p2align 4,,2
-        je      L(return)
-        movzbl  (%eax), %ebx
-        testb   $2, %cl
-        movb    %bl, (%edx)
-        je      L(return)
-        movzwl  -2(%eax,%ecx), %ebx
-        movw    %bx, -2(%edx,%ecx)
-        jmp     L(return)
-
-L(len_9_16_bytes):
-        movq    (%eax), %xmm0
-        movq    -8(%eax, %ecx), %xmm1
-        movq    %xmm0, (%edx)
-        movq    %xmm1, -8(%edx, %ecx)
-        jmp     L(return)
-
-L(len_5_8_bytes):
-        movl    (%eax), %ebx
-        movl    %ebx, (%edx)
-        movl    -4(%eax,%ecx), %ebx
-        movl    %ebx, -4(%edx,%ecx)
-        jmp     L(return)
-
-L(return):
-        movl    %edx, %eax
-        RETURN
-
-END (MEMCPY)

@@ -67,6 +67,12 @@ name: \
 cfi_startproc
 #endif

+#ifndef ALIAS_SYMBOL
+# define ALIAS_SYMBOL(alias, original) \
+    .globl alias; \
+    .equ alias, original
+#endif
+
 #ifndef END
 # define END(name) \
 cfi_endproc; \
@@ -537,3 +543,5 @@ L(mm_large_page_loop_backward):
 jmp     L(mm_recalc_len)

 END (MEMMOVE)
+
+ALIAS_SYMBOL(memcpy, MEMMOVE)

@@ -1,299 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-  * Redistributions of source code must retain the above copyright notice,
-  * this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above copyright notice,
-  * this list of conditions and the following disclaimer in the documentation
-  * and/or other materials provided with the distribution.
-
-  * Neither the name of Intel Corporation nor the names of its contributors
-  * may be used to endorse or promote products derived from this software
-  * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include "cache.h"
-
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#ifndef L
-# define L(label) .L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
-#endif
-
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
-#endif
-
-#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
-#endif
-
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name) \
-        .type name, @function; \
-        .globl name; \
-        .p2align 4; \
-name: \
-        cfi_startproc
-#endif
-
-#ifndef END
-# define END(name) \
-        cfi_endproc; \
-        .size name, .-name
-#endif
-
-#define CFI_PUSH(REG) \
-        cfi_adjust_cfa_offset (4); \
-        cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG) \
-        cfi_adjust_cfa_offset (-4); \
-        cfi_restore (REG)
-
-#define PUSH(REG) push REG;
-#define POP(REG) pop REG;
-
-#define ENTRANCE PUSH (%rbx);
-#define RETURN_END POP (%rbx); ret
-#define RETURN RETURN_END;
-
-.section .text.sse2,"ax",@progbits
-ENTRY (MEMCPY)
-        ENTRANCE
-        cmp     %rsi, %rdi
-        je      L(return)
-
-        cmp     $16, %rdx
-        jbe     L(len_0_16_bytes)
-
-        cmp     $SHARED_CACHE_SIZE_HALF, %rdx
-        jae     L(large_page)
-
-        movdqu  (%rsi), %xmm0
-        movdqu  -16(%rsi, %rdx), %xmm1
-        cmp     $32, %rdx
-        movdqu  %xmm0, (%rdi)
-        movdqu  %xmm1, -16(%rdi, %rdx)
-        jbe     L(return)
-
-        movdqu  16(%rsi), %xmm0
-        movdqu  -32(%rsi, %rdx), %xmm1
-        cmp     $64, %rdx
-        movdqu  %xmm0, 16(%rdi)
-        movdqu  %xmm1, -32(%rdi, %rdx)
-        jbe     L(return)
-
-        movdqu  32(%rsi), %xmm0
-        movdqu  48(%rsi), %xmm1
-        movdqu  -48(%rsi, %rdx), %xmm2
-        movdqu  -64(%rsi, %rdx), %xmm3
-        cmp     $128, %rdx
-        movdqu  %xmm0, 32(%rdi)
-        movdqu  %xmm1, 48(%rdi)
-        movdqu  %xmm2, -48(%rdi, %rdx)
-        movdqu  %xmm3, -64(%rdi, %rdx)
-        jbe     L(return)
-
-/* Now the main loop: we align the address of the destination. */
-        lea     64(%rdi), %r8
-        and     $-64, %r8
-
-        add     %rdi, %rdx
-        and     $-64, %rdx
-
-        sub     %rdi, %rsi
-
-/* We should stop two iterations before the termination
-        (in order not to misprefetch). */
-        sub     $64, %rdx
-        cmp     %r8, %rdx
-        je      L(main_loop_just_one_iteration)
-
-        sub     $64, %rdx
-        cmp     %r8, %rdx
-        je      L(main_loop_last_two_iterations)
-
-
-        .p2align 4
-L(main_loop_cache):
-
-        prefetcht0 128(%r8, %rsi)
-
-        movdqu  (%r8, %rsi), %xmm0
-        movdqu  16(%r8, %rsi), %xmm1
-        movdqu  32(%r8, %rsi), %xmm2
-        movdqu  48(%r8, %rsi), %xmm3
-        movdqa  %xmm0, (%r8)
-        movdqa  %xmm1, 16(%r8)
-        movdqa  %xmm2, 32(%r8)
-        movdqa  %xmm3, 48(%r8)
-        lea     64(%r8), %r8
-        cmp     %r8, %rdx
-        jne     L(main_loop_cache)
-
-L(main_loop_last_two_iterations):
-        movdqu  (%r8, %rsi), %xmm0
-        movdqu  16(%r8, %rsi), %xmm1
-        movdqu  32(%r8, %rsi), %xmm2
-        movdqu  48(%r8, %rsi), %xmm3
-        movdqu  64(%r8, %rsi), %xmm4
-        movdqu  80(%r8, %rsi), %xmm5
-        movdqu  96(%r8, %rsi), %xmm6
-        movdqu  112(%r8, %rsi), %xmm7
-        movdqa  %xmm0, (%r8)
-        movdqa  %xmm1, 16(%r8)
-        movdqa  %xmm2, 32(%r8)
-        movdqa  %xmm3, 48(%r8)
-        movdqa  %xmm4, 64(%r8)
-        movdqa  %xmm5, 80(%r8)
-        movdqa  %xmm6, 96(%r8)
-        movdqa  %xmm7, 112(%r8)
-        jmp     L(return)
-
-L(main_loop_just_one_iteration):
-        movdqu  (%r8, %rsi), %xmm0
-        movdqu  16(%r8, %rsi), %xmm1
-        movdqu  32(%r8, %rsi), %xmm2
-        movdqu  48(%r8, %rsi), %xmm3
-        movdqa  %xmm0, (%r8)
-        movdqa  %xmm1, 16(%r8)
-        movdqa  %xmm2, 32(%r8)
-        movdqa  %xmm3, 48(%r8)
-        jmp     L(return)
-
-L(large_page):
-        movdqu  (%rsi), %xmm0
-        movdqu  16(%rsi), %xmm1
-        movdqu  32(%rsi), %xmm2
-        movdqu  48(%rsi), %xmm3
-        movdqu  -64(%rsi, %rdx), %xmm4
-        movdqu  -48(%rsi, %rdx), %xmm5
-        movdqu  -32(%rsi, %rdx), %xmm6
-        movdqu  -16(%rsi, %rdx), %xmm7
-        movdqu  %xmm0, (%rdi)
-        movdqu  %xmm1, 16(%rdi)
-        movdqu  %xmm2, 32(%rdi)
-        movdqu  %xmm3, 48(%rdi)
-        movdqu  %xmm4, -64(%rdi, %rdx)
-        movdqu  %xmm5, -48(%rdi, %rdx)
-        movdqu  %xmm6, -32(%rdi, %rdx)
-        movdqu  %xmm7, -16(%rdi, %rdx)
-
-        movdqu  64(%rsi), %xmm0
-        movdqu  80(%rsi), %xmm1
-        movdqu  96(%rsi), %xmm2
-        movdqu  112(%rsi), %xmm3
-        movdqu  -128(%rsi, %rdx), %xmm4
-        movdqu  -112(%rsi, %rdx), %xmm5
-        movdqu  -96(%rsi, %rdx), %xmm6
-        movdqu  -80(%rsi, %rdx), %xmm7
-        movdqu  %xmm0, 64(%rdi)
-        movdqu  %xmm1, 80(%rdi)
-        movdqu  %xmm2, 96(%rdi)
-        movdqu  %xmm3, 112(%rdi)
-        movdqu  %xmm4, -128(%rdi, %rdx)
-        movdqu  %xmm5, -112(%rdi, %rdx)
-        movdqu  %xmm6, -96(%rdi, %rdx)
-        movdqu  %xmm7, -80(%rdi, %rdx)
-
-/* Now the main loop with non temporal stores. We align
-        the address of the destination. */
-        lea     128(%rdi), %r8
-        and     $-128, %r8
-
-        add     %rdi, %rdx
-        and     $-128, %rdx
-
-        sub     %rdi, %rsi
-
-        .p2align 4
-L(main_loop_large_page):
-        movdqu  (%r8, %rsi), %xmm0
-        movdqu  16(%r8, %rsi), %xmm1
-        movdqu  32(%r8, %rsi), %xmm2
-        movdqu  48(%r8, %rsi), %xmm3
-        movdqu  64(%r8, %rsi), %xmm4
-        movdqu  80(%r8, %rsi), %xmm5
-        movdqu  96(%r8, %rsi), %xmm6
-        movdqu  112(%r8, %rsi), %xmm7
-        movntdq %xmm0, (%r8)
-        movntdq %xmm1, 16(%r8)
-        movntdq %xmm2, 32(%r8)
-        movntdq %xmm3, 48(%r8)
-        movntdq %xmm4, 64(%r8)
-        movntdq %xmm5, 80(%r8)
-        movntdq %xmm6, 96(%r8)
-        movntdq %xmm7, 112(%r8)
-        lea     128(%r8), %r8
-        cmp     %r8, %rdx
-        jne     L(main_loop_large_page)
-        sfence
-        jmp     L(return)
-
-L(len_0_16_bytes):
-        testb   $24, %dl
-        jne     L(len_9_16_bytes)
-        testb   $4, %dl
-        .p2align 4,,5
-        jne     L(len_5_8_bytes)
-        test    %rdx, %rdx
-        .p2align 4,,2
-        je      L(return)
-        movzbl  (%rsi), %ebx
-        testb   $2, %dl
-        movb    %bl, (%rdi)
-        je      L(return)
-        movzwl  -2(%rsi,%rdx), %ebx
-        movw    %bx, -2(%rdi,%rdx)
-        jmp     L(return)
-
-L(len_9_16_bytes):
-        movq    (%rsi), %xmm0
-        movq    -8(%rsi, %rdx), %xmm1
-        movq    %xmm0, (%rdi)
-        movq    %xmm1, -8(%rdi, %rdx)
-        jmp     L(return)
-
-L(len_5_8_bytes):
-        movl    (%rsi), %ebx
-        movl    %ebx, (%rdi)
-        movl    -4(%rsi,%rdx), %ebx
-        movl    %ebx, -4(%rdi,%rdx)
-        jmp     L(return)
-
-L(return):
-        mov     %rdi, %rax
-        RETURN
-
-END (MEMCPY)

@@ -67,6 +67,12 @@ name: \
 cfi_startproc
 #endif

+#ifndef ALIAS_SYMBOL
+# define ALIAS_SYMBOL(alias, original) \
+    .globl alias; \
+    .equ alias, original
+#endif
+
 #ifndef END
 # define END(name) \
 cfi_endproc; \
@@ -508,3 +514,5 @@ L(mm_large_page_loop_backward):
 jmp     L(mm_recalc_len)

 END (MEMMOVE)
+
+ALIAS_SYMBOL(memcpy, MEMMOVE)