Make memcpy memmove

Bug: http://b/63992911
Test: Change BoardConfig.mk and compile for each variant
Change-Id: Ia0cc68d8e90e3316ddb2e9ff1555a009b6a0c5be
Haibo Huang 2018-05-24 20:39:18 -07:00
parent 42596b7bf0
commit 8a0f0ed5e7
21 changed files with 58 additions and 1047 deletions
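
The pattern repeated across the diffs below: each architecture-specific memcpy entry point is renamed to __memcpy, memmove keeps branching to __memcpy when the buffers cannot overlap, and the public memcpy symbol is exported as an alias of memmove through the ALIAS_SYMBOL macro (a fallback definition of that macro is added in the x86/x86-64 files later in the commit). Since memmove handles overlapping buffers, callers that pass overlapping regions to memcpy now get memmove behavior. As a minimal sketch of the macro only, ALIAS_SYMBOL(memcpy, memmove) expands to assembler directives along these lines:

    .globl memcpy              /* give the alias external linkage */
    .equ   memcpy, memmove     /* memcpy resolves to the same code as memmove */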

View File

@ -855,7 +855,6 @@ cc_library_static {
arm: {
srcs: [
"arch-arm/generic/bionic/memcmp.S",
"arch-arm/generic/bionic/memcpy.S",
"arch-arm/generic/bionic/memmove.S",
"arch-arm/generic/bionic/memset.S",
"arch-arm/generic/bionic/strcmp.S",
@ -1125,7 +1124,6 @@ cc_library_static {
"arch-x86/atom/string/sse2-wcsrchr-atom.S",
"arch-x86/atom/string/sse2-wcslen-atom.S",
"arch-x86/atom/string/sse2-wcscmp-atom.S",
"arch-x86/silvermont/string/sse2-memcpy-slm.S",
"arch-x86/silvermont/string/sse2-memmove-slm.S",
"arch-x86/silvermont/string/sse2-memset-slm.S",
"arch-x86/silvermont/string/sse2-stpcpy-slm.S",
@ -1154,14 +1152,12 @@ cc_library_static {
"arch-x86/atom/string/sse2-strlen-atom.S",
"arch-x86/atom/string/ssse3-memcmp-atom.S",
"arch-x86/atom/string/ssse3-memcpy-atom.S",
"arch-x86/atom/string/ssse3-memmove-atom.S",
"arch-x86/atom/string/ssse3-strcpy-atom.S",
"arch-x86/atom/string/ssse3-strncpy-atom.S",
"arch-x86/atom/string/ssse3-wmemcmp-atom.S",
],
exclude_srcs: [
"arch-x86/generic/string/memcmp.S",
"arch-x86/silvermont/string/sse2-memcpy-slm.S",
"arch-x86/silvermont/string/sse2-memmove-slm.S",
"arch-x86/silvermont/string/sse2-memset-slm.S",
"arch-x86/silvermont/string/sse2-strcpy-slm.S",
@ -1198,7 +1194,6 @@ cc_library_static {
},
x86_64: {
srcs: [
"arch-x86_64/string/sse2-memcpy-slm.S",
"arch-x86_64/string/sse2-memmove-slm.S",
"arch-x86_64/string/sse2-memset-slm.S",
"arch-x86_64/string/sse2-stpcpy-slm.S",

View File

@ -64,7 +64,7 @@
.arch armv7-a
// Prototype: void *memcpy (void *dst, const void *src, size_t count).
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #64]
push {r0, lr}
.cfi_def_cfa_offset 8
@ -72,4 +72,4 @@ ENTRY(memcpy)
.cfi_rel_offset lr, 4
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -64,7 +64,7 @@
.arch armv7-a
// Prototype: void *memcpy (void *dst, const void *src, size_t count).
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #64]
push {r0, lr}
.cfi_def_cfa_offset 8
@ -72,4 +72,4 @@ ENTRY(memcpy)
.cfi_rel_offset lr, 4
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -64,7 +64,7 @@
.arch armv7-a
// Prototype: void *memcpy (void *dst, const void *src, size_t count).
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #64]
push {r0, lr}
.cfi_def_cfa_offset 8
@ -72,4 +72,4 @@ ENTRY(memcpy)
.cfi_rel_offset lr, 4
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -39,14 +39,14 @@
.thumb
.thumb_func
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #0]
stmfd sp!, {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
pld [r1, #64]
END(memcpy)
END(__memcpy)
#define MEMCPY_BASE __memcpy_base
#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned

View File

@ -65,13 +65,13 @@
// arch. The code generated is exactly the same.
.arch armv7-a
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #64]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
END(memcpy)
END(__memcpy)
#define MEMCPY_BASE __memcpy_base
#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned

View File

@ -50,7 +50,7 @@ ENTRY(memmove)
bhi .L_reversed_memcpy
.L_jump_to_memcpy:
b memcpy
b __memcpy
.L_reversed_memcpy:
push {r0, lr}
@ -278,3 +278,5 @@ ENTRY(memmove)
pop {r0, pc}
END(memmove)
ALIAS_SYMBOL(memcpy, memmove)

View File

@ -1,379 +0,0 @@
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <private/bionic_asm.h>
/*
* Optimized memcpy() for ARM.
*
* note that memcpy() always returns the destination pointer,
* so we have to preserve R0.
*/
.syntax unified
ENTRY(memcpy)
/* The stack must always be 64-bits aligned to be compliant with the
* ARM ABI. Since we have to save R0, we might as well save R4
* which we can use for better pipelining of the reads below
*/
stmfd sp!, {r0, r4, lr}
.cfi_def_cfa_offset 12
.cfi_rel_offset r0, 0
.cfi_rel_offset r4, 4
.cfi_rel_offset lr, 8
/* Making room for r5-r11 which will be spilled later */
sub sp, sp, #28
.cfi_adjust_cfa_offset 28
// preload the destination because we'll align it to a cache line
// with small writes. Also start the source "pump".
pld [r0, #0]
pld [r1, #0]
pld [r1, #32]
/* it simplifies things to take care of len<4 early */
cmp r2, #4
blo .Lcopy_last_3_and_return
/* compute the offset to align the source
* offset = (4-(src&3))&3 = -src & 3
*/
rsb r3, r1, #0
ands r3, r3, #3
beq .Lsrc_aligned
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrbmi r3, [r1], #1
ldrbcs r4, [r1], #1
ldrbcs r12,[r1], #1
strbmi r3, [r0], #1
strbcs r4, [r0], #1
strbcs r12,[r0], #1
.Lsrc_aligned:
/* see if src and dst are aligned together (congruent) */
eor r12, r0, r1
tst r12, #3
bne .Lnon_congruent
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* align the destination to a cache-line */
rsb r3, r0, #0
ands r3, r3, #0x1C
beq .Lcongruent_aligned32
cmp r3, r2
andhi r3, r2, #0x1C
/* conditionally copies 0 to 7 words (length in r3) */
movs r12, r3, lsl #28
ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmi r1!, {r8, r9} /* 8 bytes */
stmcs r0!, {r4, r5, r6, r7}
stmmi r0!, {r8, r9}
tst r3, #0x4
ldrne r10,[r1], #4 /* 4 bytes */
strne r10,[r0], #4
sub r2, r2, r3
.Lcongruent_aligned32:
/*
* here source is aligned to 32 bytes.
*/
.Lcached_aligned32:
subs r2, r2, #32
blo .Lless_than_32_left
/*
* We preload a cache-line up to 64 bytes ahead. On the 926, this will
* stall only until the requested word is fetched, but the linefill
* continues in the background.
* While the linefill is going, we write our previous cache-line
* into the write-buffer (which should have some free space).
* When the linefill is done, the writebuffer will
* start dumping its content into memory
*
* While all this is going, we then load a full cache line into
* 8 registers, this cache line should be in the cache by now
* (or partly in the cache).
*
* This code should work well regardless of the source/dest alignment.
*
*/
// Align the preload register to a cache-line because the cpu does
// "critical word first" (the first word requested is loaded first).
bic r12, r1, #0x1F
add r12, r12, #64
1: ldmia r1!, { r4-r11 }
pld [r12, #64]
subs r2, r2, #32
// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
// for ARM9 preload will not be safely guarded by the preceding subs.
// When it is safely guarded, the only way to get a SIGSEGV here
// is if the caller overstates the length.
ldrhi r3, [r12], #32 /* cheap ARM9 preload */
stmia r0!, { r4-r11 }
bhs 1b
add r2, r2, #32
.Lless_than_32_left:
/*
* less than 32 bytes left at this point (length in r2)
*/
/* skip all this if there is nothing to do, which should
* be a common case (if not executed the code below takes
* about 16 cycles)
*/
tst r2, #0x1F
beq 1f
/* conditionally copies 0 to 31 bytes */
movs r12, r2, lsl #28
ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmi r1!, {r8, r9} /* 8 bytes */
stmcs r0!, {r4, r5, r6, r7}
stmmi r0!, {r8, r9}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrhmi r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strhmi r4, [r0], #2
tst r2, #0x1
ldrbne r3, [r1] /* last byte */
strbne r3, [r0]
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
ldmfd sp!, {r0, r4, pc}
/********************************************************************/
.Lnon_congruent:
/*
* here source is aligned to 4 bytes
* but destination is not.
*
* in the code below r2 is the number of bytes read
* (the number of bytes written is always smaller, because we have
* partial words in the shift queue)
*/
cmp r2, #4
blo .Lcopy_last_3_and_return
/* Use post-increment mode for stm to spill r5-r11 to reserved stack
* frame. Don't update sp.
*/
stmea sp, {r5-r11}
/* compute shifts needed to align src to dest */
rsb r5, r0, #0
and r5, r5, #3 /* r5 = # bytes in partial words */
mov r12, r5, lsl #3 /* r12 = right */
rsb lr, r12, #32 /* lr = left */
/* read the first word */
ldr r3, [r1], #4
sub r2, r2, #4
/* write a partial word (0 to 3 bytes), such that destination
* becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
*/
movs r5, r5, lsl #31
strbmi r3, [r0], #1
movmi r3, r3, lsr #8
strbcs r3, [r0], #1
movcs r3, r3, lsr #8
strbcs r3, [r0], #1
movcs r3, r3, lsr #8
cmp r2, #4
blo .Lpartial_word_tail
/* Align destination to 32 bytes (cache line boundary) */
1: tst r0, #0x1c
beq 2f
ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
blo .Lpartial_word_tail
/* copy 32 bytes at a time */
2: subs r2, r2, #32
blo .Lless_than_thirtytwo
/* Use immediate mode for the shifts, because there is an extra cycle
* for register shifts, which could account for up to 50% of
* performance hit.
*/
cmp r12, #24
beq .Lloop24
cmp r12, #8
beq .Lloop8
.Lloop16:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r6, lsl #16
mov r6, r6, lsr #16
orr r6, r6, r7, lsl #16
mov r7, r7, lsr #16
orr r7, r7, r8, lsl #16
mov r8, r8, lsr #16
orr r8, r8, r9, lsl #16
mov r9, r9, lsr #16
orr r9, r9, r10, lsl #16
mov r10, r10, lsr #16
orr r10, r10, r11, lsl #16
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #16
bhs 1b
b .Lless_than_thirtytwo
.Lloop8:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r6, lsl #24
mov r6, r6, lsr #8
orr r6, r6, r7, lsl #24
mov r7, r7, lsr #8
orr r7, r7, r8, lsl #24
mov r8, r8, lsr #8
orr r8, r8, r9, lsl #24
mov r9, r9, lsr #8
orr r9, r9, r10, lsl #24
mov r10, r10, lsr #8
orr r10, r10, r11, lsl #24
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #8
bhs 1b
b .Lless_than_thirtytwo
.Lloop24:
ldr r12, [r1], #4
1: mov r4, r12
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
pld [r1, #64]
subs r2, r2, #32
ldrhs r12, [r1], #4
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r6, lsl #8
mov r6, r6, lsr #24
orr r6, r6, r7, lsl #8
mov r7, r7, lsr #24
orr r7, r7, r8, lsl #8
mov r8, r8, lsr #24
orr r8, r8, r9, lsl #8
mov r9, r9, lsr #24
orr r9, r9, r10, lsl #8
mov r10, r10, lsr #24
orr r10, r10, r11, lsl #8
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
mov r3, r11, lsr #24
bhs 1b
.Lless_than_thirtytwo:
/* copy the last 0 to 31 bytes of the source */
rsb r12, lr, #32 /* we corrupted r12, recompute it */
add r2, r2, #32
cmp r2, #4
blo .Lpartial_word_tail
1: ldr r5, [r1], #4
sub r2, r2, #4
orr r4, r3, r5, lsl lr
mov r3, r5, lsr r12
str r4, [r0], #4
cmp r2, #4
bhs 1b
.Lpartial_word_tail:
/* we have a partial word in the input buffer */
movs r5, lr, lsl #(31-3)
strbmi r3, [r0], #1
movmi r3, r3, lsr #8
strbcs r3, [r0], #1
movcs r3, r3, lsr #8
strbcs r3, [r0], #1
/* Refill spilled registers from the stack. Don't update sp. */
ldmfd sp, {r5-r11}
.Lcopy_last_3_and_return:
movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
ldrbmi r2, [r1], #1
ldrbcs r3, [r1], #1
ldrbcs r12,[r1]
strbmi r2, [r0], #1
strbcs r3, [r0], #1
strbcs r12,[r0]
/* we're done! restore sp and spilled registers and return */
add sp, sp, #28
ldmfd sp!, {r0, r4, pc}
END(memcpy)

View File

@ -469,3 +469,5 @@ ENTRY(memmove)
bl bsd_safe_memcpy
ldmfd sp!, {r0, pc}
END(memmove)
ALIAS_SYMBOL(memcpy, memmove)

View File

@ -42,7 +42,7 @@
.thumb
.thumb_func
ENTRY(memcpy)
ENTRY(__memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
.cfi_adjust_cfa_offset 8
@ -50,4 +50,4 @@ ENTRY(memcpy)
.cfi_rel_offset lr, 4
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -34,7 +34,7 @@
#define PLDSIZE (128) /* L2 cache line size */
.code 32
ENTRY(memcpy)
ENTRY(__memcpy)
push {r0}
.cfi_def_cfa_offset 4
.cfi_rel_offset r0, 0
@ -123,4 +123,4 @@ ENTRY(memcpy)
pop {r0}
bx lr
END(memcpy)
END(__memcpy)

View File

@ -30,6 +30,6 @@
#include <private/bionic_asm.h>
ENTRY(memcpy)
ENTRY(__memcpy)
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -65,7 +65,7 @@ ENTRY(memmove)
b.lo .Ldownwards
add tmp1, src, count
cmp dstin, tmp1
b.hs memcpy /* No overlap. */
b.hs __memcpy /* No overlap. */
/* Upwards move with potential overlap.
* Need to move from the tail backwards. SRC and DST point one
@ -196,7 +196,7 @@ ENTRY(memmove)
* DST is more than 16 bytes away from SRC. */
sub tmp1, src, #16
cmp dstin, tmp1
b.ls memcpy /* May overlap, but not critically. */
b.ls __memcpy /* May overlap, but not critically. */
mov dst, dstin /* Preserve DSTIN for return value. */
cmp count, #64
@ -326,4 +326,6 @@ ENTRY(memmove)
END(wmemmove)
#else
END(memmove)
ALIAS_SYMBOL(memcpy, memmove)
#endif

View File

@ -30,6 +30,6 @@
#include <private/bionic_asm.h>
ENTRY(memcpy)
ENTRY(__memcpy)
#include "memcpy_base.S"
END(memcpy)
END(__memcpy)

View File

@ -92,7 +92,7 @@ ENTRY(memmove)
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
b.hs memcpy
b.hs __memcpy
cbz tmp1, 3f
add dstend, dstin, count
@ -150,4 +150,6 @@ ENTRY(memmove)
END(wmemmove)
#else
END(memmove)
ALIAS_SYMBOL(memcpy, memmove)
#endif

View File

@ -34,6 +34,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# define MEMCPY memcpy
#endif
#ifndef USE_AS_MEMMOVE
# define USE_AS_MEMMOVE
#endif
#ifndef L
# define L(label) .L##label
#endif
@ -67,6 +71,12 @@ name: \
cfi_startproc
#endif
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
.globl alias; \
.equ alias, original
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
@ -3122,3 +3132,5 @@ L(bk_ssse3_cpy):
#endif
END (MEMCPY)
ALIAS_SYMBOL(memmove, MEMCPY)
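
Note that the alias runs in the opposite direction here compared with the ARM/AArch64 and silvermont files: the atom ssse3 routine is still emitted under the name memcpy, but USE_AS_MEMMOVE is now forced so the overlap-checking paths are always assembled, and memmove becomes the alias. That is why the separate ssse3-memmove-atom.S wrapper (the 34-line file deleted below) is no longer needed. A rough sketch of what this file reduces to after the hunk above, assuming MEMCPY is left at its default:

    #define MEMCPY memcpy            /* entry point keeps the name memcpy */
    #define USE_AS_MEMMOVE           /* always build the overlap-safe code paths */
    /* ... body of the copy routine, assembled once ... */
    .globl memmove
    .equ   memmove, memcpy           /* ALIAS_SYMBOL(memmove, MEMCPY) */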

View File

@ -1,34 +0,0 @@
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define MEMCPY memmove
#define USE_AS_MEMMOVE
#include "ssse3-memcpy-atom.S"

View File

@ -1,308 +0,0 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cache.h"
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
#ifndef L
# define L(label) .L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif
#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
#define DEST PARMS
#define SRC DEST+4
#define LEN SRC+4
#define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
#define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)
#define PARMS 8 /* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)
.section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
ENTRANCE
movl LEN(%esp), %ecx
movl SRC(%esp), %eax
movl DEST(%esp), %edx
cmp %eax, %edx
je L(return)
cmp $16, %ecx
jbe L(len_0_16_bytes)
cmp $SHARED_CACHE_SIZE_HALF, %ecx
jae L(large_page)
movdqu (%eax), %xmm0
movdqu -16(%eax, %ecx), %xmm1
cmpl $32, %ecx
movdqu %xmm0, (%edx)
movdqu %xmm1, -16(%edx, %ecx)
jbe L(return)
movdqu 16(%eax), %xmm0
movdqu -32(%eax, %ecx), %xmm1
cmpl $64, %ecx
movdqu %xmm0, 16(%edx)
movdqu %xmm1, -32(%edx, %ecx)
jbe L(return)
movdqu 32(%eax), %xmm0
movdqu 48(%eax), %xmm1
movdqu -48(%eax, %ecx), %xmm2
movdqu -64(%eax, %ecx), %xmm3
cmpl $128, %ecx
movdqu %xmm0, 32(%edx)
movdqu %xmm1, 48(%edx)
movdqu %xmm2, -48(%edx, %ecx)
movdqu %xmm3, -64(%edx, %ecx)
jbe L(return)
/* Now the main loop: we align the address of the destination. */
leal 64(%edx), %ebx
andl $-64, %ebx
addl %edx, %ecx
andl $-64, %ecx
subl %edx, %eax
/* We should stop two iterations before the termination
(in order not to misprefetch). */
subl $64, %ecx
cmpl %ebx, %ecx
je L(main_loop_just_one_iteration)
subl $64, %ecx
cmpl %ebx, %ecx
je L(main_loop_last_two_iterations)
.p2align 4
L(main_loop_cache):
prefetcht0 128(%ebx, %eax)
movdqu (%ebx, %eax), %xmm0
movdqu 16(%ebx, %eax), %xmm1
movdqu 32(%ebx, %eax), %xmm2
movdqu 48(%ebx, %eax), %xmm3
movdqa %xmm0, (%ebx)
movdqa %xmm1, 16(%ebx)
movdqa %xmm2, 32(%ebx)
movdqa %xmm3, 48(%ebx)
lea 64(%ebx), %ebx
cmpl %ebx, %ecx
jne L(main_loop_cache)
L(main_loop_last_two_iterations):
movdqu (%ebx, %eax), %xmm0
movdqu 16(%ebx, %eax), %xmm1
movdqu 32(%ebx, %eax), %xmm2
movdqu 48(%ebx, %eax), %xmm3
movdqu 64(%ebx, %eax), %xmm4
movdqu 80(%ebx, %eax), %xmm5
movdqu 96(%ebx, %eax), %xmm6
movdqu 112(%ebx, %eax), %xmm7
movdqa %xmm0, (%ebx)
movdqa %xmm1, 16(%ebx)
movdqa %xmm2, 32(%ebx)
movdqa %xmm3, 48(%ebx)
movdqa %xmm4, 64(%ebx)
movdqa %xmm5, 80(%ebx)
movdqa %xmm6, 96(%ebx)
movdqa %xmm7, 112(%ebx)
jmp L(return)
L(main_loop_just_one_iteration):
movdqu (%ebx, %eax), %xmm0
movdqu 16(%ebx, %eax), %xmm1
movdqu 32(%ebx, %eax), %xmm2
movdqu 48(%ebx, %eax), %xmm3
movdqa %xmm0, (%ebx)
movdqa %xmm1, 16(%ebx)
movdqa %xmm2, 32(%ebx)
movdqa %xmm3, 48(%ebx)
jmp L(return)
L(large_page):
movdqu (%eax), %xmm0
movdqu 16(%eax), %xmm1
movdqu 32(%eax), %xmm2
movdqu 48(%eax), %xmm3
movdqu -64(%eax, %ecx), %xmm4
movdqu -48(%eax, %ecx), %xmm5
movdqu -32(%eax, %ecx), %xmm6
movdqu -16(%eax, %ecx), %xmm7
movdqu %xmm0, (%edx)
movdqu %xmm1, 16(%edx)
movdqu %xmm2, 32(%edx)
movdqu %xmm3, 48(%edx)
movdqu %xmm4, -64(%edx, %ecx)
movdqu %xmm5, -48(%edx, %ecx)
movdqu %xmm6, -32(%edx, %ecx)
movdqu %xmm7, -16(%edx, %ecx)
movdqu 64(%eax), %xmm0
movdqu 80(%eax), %xmm1
movdqu 96(%eax), %xmm2
movdqu 112(%eax), %xmm3
movdqu -128(%eax, %ecx), %xmm4
movdqu -112(%eax, %ecx), %xmm5
movdqu -96(%eax, %ecx), %xmm6
movdqu -80(%eax, %ecx), %xmm7
movdqu %xmm0, 64(%edx)
movdqu %xmm1, 80(%edx)
movdqu %xmm2, 96(%edx)
movdqu %xmm3, 112(%edx)
movdqu %xmm4, -128(%edx, %ecx)
movdqu %xmm5, -112(%edx, %ecx)
movdqu %xmm6, -96(%edx, %ecx)
movdqu %xmm7, -80(%edx, %ecx)
/* Now the main loop with non temporal stores. We align
the address of the destination. */
leal 128(%edx), %ebx
andl $-128, %ebx
addl %edx, %ecx
andl $-128, %ecx
subl %edx, %eax
.p2align 4
L(main_loop_large_page):
movdqu (%ebx, %eax), %xmm0
movdqu 16(%ebx, %eax), %xmm1
movdqu 32(%ebx, %eax), %xmm2
movdqu 48(%ebx, %eax), %xmm3
movdqu 64(%ebx, %eax), %xmm4
movdqu 80(%ebx, %eax), %xmm5
movdqu 96(%ebx, %eax), %xmm6
movdqu 112(%ebx, %eax), %xmm7
movntdq %xmm0, (%ebx)
movntdq %xmm1, 16(%ebx)
movntdq %xmm2, 32(%ebx)
movntdq %xmm3, 48(%ebx)
movntdq %xmm4, 64(%ebx)
movntdq %xmm5, 80(%ebx)
movntdq %xmm6, 96(%ebx)
movntdq %xmm7, 112(%ebx)
lea 128(%ebx), %ebx
cmpl %ebx, %ecx
jne L(main_loop_large_page)
sfence
jmp L(return)
L(len_0_16_bytes):
testb $24, %cl
jne L(len_9_16_bytes)
testb $4, %cl
.p2align 4,,5
jne L(len_5_8_bytes)
testl %ecx, %ecx
.p2align 4,,2
je L(return)
movzbl (%eax), %ebx
testb $2, %cl
movb %bl, (%edx)
je L(return)
movzwl -2(%eax,%ecx), %ebx
movw %bx, -2(%edx,%ecx)
jmp L(return)
L(len_9_16_bytes):
movq (%eax), %xmm0
movq -8(%eax, %ecx), %xmm1
movq %xmm0, (%edx)
movq %xmm1, -8(%edx, %ecx)
jmp L(return)
L(len_5_8_bytes):
movl (%eax), %ebx
movl %ebx, (%edx)
movl -4(%eax,%ecx), %ebx
movl %ebx, -4(%edx,%ecx)
jmp L(return)
L(return):
movl %edx, %eax
RETURN
END (MEMCPY)

View File

@ -67,6 +67,12 @@ name: \
cfi_startproc
#endif
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
.globl alias; \
.equ alias, original
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
@ -537,3 +543,5 @@ L(mm_large_page_loop_backward):
jmp L(mm_recalc_len)
END (MEMMOVE)
ALIAS_SYMBOL(memcpy, MEMMOVE)

View File

@ -1,299 +0,0 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cache.h"
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
#ifndef L
# define L(label) .L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif
#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
#define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
#define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
#define PUSH(REG) push REG;
#define POP(REG) pop REG;
#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;
.section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
ENTRANCE
cmp %rsi, %rdi
je L(return)
cmp $16, %rdx
jbe L(len_0_16_bytes)
cmp $SHARED_CACHE_SIZE_HALF, %rdx
jae L(large_page)
movdqu (%rsi), %xmm0
movdqu -16(%rsi, %rdx), %xmm1
cmp $32, %rdx
movdqu %xmm0, (%rdi)
movdqu %xmm1, -16(%rdi, %rdx)
jbe L(return)
movdqu 16(%rsi), %xmm0
movdqu -32(%rsi, %rdx), %xmm1
cmp $64, %rdx
movdqu %xmm0, 16(%rdi)
movdqu %xmm1, -32(%rdi, %rdx)
jbe L(return)
movdqu 32(%rsi), %xmm0
movdqu 48(%rsi), %xmm1
movdqu -48(%rsi, %rdx), %xmm2
movdqu -64(%rsi, %rdx), %xmm3
cmp $128, %rdx
movdqu %xmm0, 32(%rdi)
movdqu %xmm1, 48(%rdi)
movdqu %xmm2, -48(%rdi, %rdx)
movdqu %xmm3, -64(%rdi, %rdx)
jbe L(return)
/* Now the main loop: we align the address of the destination. */
lea 64(%rdi), %r8
and $-64, %r8
add %rdi, %rdx
and $-64, %rdx
sub %rdi, %rsi
/* We should stop two iterations before the termination
(in order not to misprefetch). */
sub $64, %rdx
cmp %r8, %rdx
je L(main_loop_just_one_iteration)
sub $64, %rdx
cmp %r8, %rdx
je L(main_loop_last_two_iterations)
.p2align 4
L(main_loop_cache):
prefetcht0 128(%r8, %rsi)
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movdqa %xmm0, (%r8)
movdqa %xmm1, 16(%r8)
movdqa %xmm2, 32(%r8)
movdqa %xmm3, 48(%r8)
lea 64(%r8), %r8
cmp %r8, %rdx
jne L(main_loop_cache)
L(main_loop_last_two_iterations):
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movdqu 64(%r8, %rsi), %xmm4
movdqu 80(%r8, %rsi), %xmm5
movdqu 96(%r8, %rsi), %xmm6
movdqu 112(%r8, %rsi), %xmm7
movdqa %xmm0, (%r8)
movdqa %xmm1, 16(%r8)
movdqa %xmm2, 32(%r8)
movdqa %xmm3, 48(%r8)
movdqa %xmm4, 64(%r8)
movdqa %xmm5, 80(%r8)
movdqa %xmm6, 96(%r8)
movdqa %xmm7, 112(%r8)
jmp L(return)
L(main_loop_just_one_iteration):
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movdqa %xmm0, (%r8)
movdqa %xmm1, 16(%r8)
movdqa %xmm2, 32(%r8)
movdqa %xmm3, 48(%r8)
jmp L(return)
L(large_page):
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
movdqu -64(%rsi, %rdx), %xmm4
movdqu -48(%rsi, %rdx), %xmm5
movdqu -32(%rsi, %rdx), %xmm6
movdqu -16(%rsi, %rdx), %xmm7
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqu %xmm4, -64(%rdi, %rdx)
movdqu %xmm5, -48(%rdi, %rdx)
movdqu %xmm6, -32(%rdi, %rdx)
movdqu %xmm7, -16(%rdi, %rdx)
movdqu 64(%rsi), %xmm0
movdqu 80(%rsi), %xmm1
movdqu 96(%rsi), %xmm2
movdqu 112(%rsi), %xmm3
movdqu -128(%rsi, %rdx), %xmm4
movdqu -112(%rsi, %rdx), %xmm5
movdqu -96(%rsi, %rdx), %xmm6
movdqu -80(%rsi, %rdx), %xmm7
movdqu %xmm0, 64(%rdi)
movdqu %xmm1, 80(%rdi)
movdqu %xmm2, 96(%rdi)
movdqu %xmm3, 112(%rdi)
movdqu %xmm4, -128(%rdi, %rdx)
movdqu %xmm5, -112(%rdi, %rdx)
movdqu %xmm6, -96(%rdi, %rdx)
movdqu %xmm7, -80(%rdi, %rdx)
/* Now the main loop with non temporal stores. We align
the address of the destination. */
lea 128(%rdi), %r8
and $-128, %r8
add %rdi, %rdx
and $-128, %rdx
sub %rdi, %rsi
.p2align 4
L(main_loop_large_page):
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movdqu 64(%r8, %rsi), %xmm4
movdqu 80(%r8, %rsi), %xmm5
movdqu 96(%r8, %rsi), %xmm6
movdqu 112(%r8, %rsi), %xmm7
movntdq %xmm0, (%r8)
movntdq %xmm1, 16(%r8)
movntdq %xmm2, 32(%r8)
movntdq %xmm3, 48(%r8)
movntdq %xmm4, 64(%r8)
movntdq %xmm5, 80(%r8)
movntdq %xmm6, 96(%r8)
movntdq %xmm7, 112(%r8)
lea 128(%r8), %r8
cmp %r8, %rdx
jne L(main_loop_large_page)
sfence
jmp L(return)
L(len_0_16_bytes):
testb $24, %dl
jne L(len_9_16_bytes)
testb $4, %dl
.p2align 4,,5
jne L(len_5_8_bytes)
test %rdx, %rdx
.p2align 4,,2
je L(return)
movzbl (%rsi), %ebx
testb $2, %dl
movb %bl, (%rdi)
je L(return)
movzwl -2(%rsi,%rdx), %ebx
movw %bx, -2(%rdi,%rdx)
jmp L(return)
L(len_9_16_bytes):
movq (%rsi), %xmm0
movq -8(%rsi, %rdx), %xmm1
movq %xmm0, (%rdi)
movq %xmm1, -8(%rdi, %rdx)
jmp L(return)
L(len_5_8_bytes):
movl (%rsi), %ebx
movl %ebx, (%rdi)
movl -4(%rsi,%rdx), %ebx
movl %ebx, -4(%rdi,%rdx)
jmp L(return)
L(return):
mov %rdi, %rax
RETURN
END (MEMCPY)

View File

@ -67,6 +67,12 @@ name: \
cfi_startproc
#endif
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
.globl alias; \
.equ alias, original
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
@ -508,3 +514,5 @@ L(mm_large_page_loop_backward):
jmp L(mm_recalc_len)
END (MEMMOVE)
ALIAS_SYMBOL(memcpy, MEMMOVE)