424 lines
11 KiB
ArmAsm
424 lines
11 KiB
ArmAsm
/*
|
|
* Copyright (c) 2009
|
|
* MIPS Technologies, Inc., California.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
/************************************************************************
|
|
*
|
|
* memcpy.S
|
|
* Version: "043009"
|
|
*
|
|
************************************************************************/
|
|
|
|
|
|
/************************************************************************
|
|
* Include files
|
|
************************************************************************/
|
|
|
|
#include <private/bionic_asm.h>
|
|
|
|
|
|
/*
|
|
* This routine could be optimized for MIPS64. The current code only
|
|
* uses MIPS32 instructions.
|
|
*/
|
|
/*
 * Endian-neutral names for the partial-word load/store instructions
 * (lwl/lwr and swl/swr).
 * memcpy below applies LWHI at the lowest address of a possibly unaligned
 * word and LWLO at that address plus 3; SWHI is applied at the lowest
 * address of a partial store. SWLO is defined but not used by memcpy below.
 * If neither __MIPSEB__ nor __MIPSEL__ is defined, none of these names is
 * defined.
 */
#if defined(__MIPSEB__)
|
|
# define LWHI lwl /* high part is left in big-endian */
|
|
# define SWHI swl /* high part is left in big-endian */
|
|
# define LWLO lwr /* low part is right in big-endian */
|
|
# define SWLO swr /* low part is right in big-endian */
|
|
#endif
|
|
|
|
#if defined(__MIPSEL__)
|
|
# define LWHI lwr /* high part is right in little-endian */
|
|
# define SWHI swr /* high part is right in little-endian */
|
|
# define LWLO lwl /* low part is left in little-endian */
|
|
# define SWLO swl /* low part is left in little-endian */
|
|
#endif
|
|
|
|
# memcpy: copies a2 bytes from the address in a1 to the address in a0.
# On the paths handled in this file v0 is set to the original a0.
#
# Outline of the code below:
#   When the unsigned distance between a0 and a1 is smaller than a2 the
#   regions overlap and control is handed to memmove with a tail jump.
#   Counts below 8 (signed compare) are copied by the byte loop at .Llast8.
#   When a0 and a1 have equal low two address bits the aligned path
#   (.Lchk16w onwards) copies with lw/sw; otherwise the .Lunaligned path
#   reads every source word with an LWHI/LWLO pair and writes it with sw.
#   Both paths copy 64-byte chunks, then at most one 32-byte chunk, then
#   single words, then the trailing bytes one at a time.
#
# Registers written here: v0, v1, a0-a3, t0-t9 and AT.
#
# Everything between ".set noreorder" and ".set reorder" is emitted in the
# written order, so the instruction after each branch or jump occupies its
# delay slot and executes whether or not the branch is taken.
LEAF(memcpy,0)
|
|
|
|
.set noreorder
|
|
.set noat
|
|
/*
|
|
* Below we handle the case where memcpy is called with overlapping src and dst.
|
|
* Although memcpy is not required to handle this case, some parts of Android like Skia
|
|
* rely on such usage. We call memmove to handle such cases.
|
|
*/
|
|
subu t0,a0,a1 # t0 = dst - src
|
|
sra AT,t0,31 # AT = 0 when t0 >= 0, else -1 (32-bit arithmetic shift)
|
|
xor t1,t0,AT # t1 = t0 ^ AT
|
|
subu t0,t1,AT # t0 = |dst - src|
|
|
sltu AT,t0,a2 # AT = 1 when |dst - src| < count (unsigned compare)
|
|
beq AT,zero,.Lmemcpy # no overlap: continue with the copy below
|
|
la t9,memmove # delay slot; t9 is rewritten before it is read on the .Lmemcpy path
|
|
jr t9 # overlap: tail-jump to memmove with a0-a2 and ra untouched
|
|
nop # delay slot
|
|
.Lmemcpy:
|
|
slti AT,a2,8 # AT = 1 when count < 8 (signed compare)
|
|
bne AT,zero,.Llast8 # short copy: go straight to the byte loop
|
|
move v0,a0 # delay slot: memcpy returns the dst pointer
|
|
|
|
|
|
# Test if the src and dst are word-aligned, or can be made word-aligned
|
|
xor t8,a1,a0
|
|
andi t8,t8,0x3 # t8 is a0/a1 word-displacement
|
|
|
|
|
|
bne t8,zero,.Lunaligned # low two address bits differ: unaligned path
|
|
negu a3,a0 # delay slot: a3 = -dst, consumed by both paths
|
|
|
|
|
|
andi a3,a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
|
|
beq a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
|
|
subu a2,a2,a3 # delay slot: a2 is now the remaining byte count (unchanged when a3=0)
|
|
|
|
|
|
# One LWHI/SWHI pair moves the a3 bytes that lie between the current
# addresses and the next word boundary (a0 and a1 share the same offset here).
LWHI t8,0(a1)
|
|
addu a1,a1,a3
|
|
SWHI t8,0(a0)
|
|
addu a0,a0,a3
|
|
|
|
|
|
# Now the dst/src are mutually word-aligned with word-aligned addresses
|
|
.Lchk16w:
|
|
andi t8,a2,0x3f # any whole 64-byte chunks?
|
|
# t8 is the byte count after 64-byte chunks
|
|
|
|
|
|
beq a2,t8,.Lchk8w # if a2==t8, no 64-byte chunks
|
|
# There will be at most 1 32-byte chunk after it
|
|
subu a3,a2,t8 # delay slot: subtract the remainder from a2
|
|
# Here a3 counts bytes in 16w chunks
|
|
addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
|
|
|
|
|
|
addu t0,a0,a2 # t0 is the "past the end" address
|
|
|
|
|
|
# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
|
|
# the "t0-32" address
|
|
# This means: for x=128 the last "safe" a0 address is "t0-160"
|
|
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
|
|
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
|
|
# NOTE(review): hint 30 is presumably PrepareForStore, which may set up a dst
# cache line without reading memory; that would explain why the hint must not
# reach a line extending past the end of dst. Confirm against the MIPS32 manual.
subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address
|
|
|
|
|
|
pref 0,0(a1) # bring the first line of src, addr 0
|
|
pref 0,32(a1) # bring the second line of src, addr 32
|
|
pref 0,64(a1) # bring the third line of src, addr 64
|
|
pref 30,32(a0) # safe, as we have at least 64 bytes ahead
|
|
# In case the a0 > t9 don't use "pref 30" at all
|
|
sgtu v1,a0,t9 # v1 = 1 when a0 > t9 (unsigned compare)
|
|
bgtz v1,.Lloop16w # skip "pref 30,64(a0)" for too short arrays
|
|
nop # delay slot
|
|
# otherwise, start with using pref30
|
|
pref 30,64(a0)
|
|
# Each pass copies 64 bytes as two groups of eight words. A nonzero v1 means
# a0 is past t9, in which case both "pref 30" hints in the pass are skipped.
.Lloop16w:
|
|
pref 0,96(a1)
|
|
lw t0,0(a1)
|
|
bgtz v1,.Lskip_pref30_96 # skip "pref 30,96(a0)"
|
|
lw t1,4(a1) # delay slot
|
|
pref 30,96(a0) # continue setting up the dest, addr 96
|
|
.Lskip_pref30_96:
|
|
lw t2,8(a1)
|
|
lw t3,12(a1)
|
|
lw t4,16(a1)
|
|
lw t5,20(a1)
|
|
lw t6,24(a1)
|
|
lw t7,28(a1)
|
|
pref 0,128(a1) # bring the next lines of src, addr 128
|
|
|
|
|
|
sw t0,0(a0)
|
|
sw t1,4(a0)
|
|
sw t2,8(a0)
|
|
sw t3,12(a0)
|
|
sw t4,16(a0)
|
|
sw t5,20(a0)
|
|
sw t6,24(a0)
|
|
sw t7,28(a0)
|
|
|
|
|
|
lw t0,32(a1)
|
|
bgtz v1,.Lskip_pref30_128 # skip "pref 30,128(a0)"
|
|
lw t1,36(a1) # delay slot
|
|
pref 30,128(a0) # continue setting up the dest, addr 128
|
|
.Lskip_pref30_128:
|
|
lw t2,40(a1)
|
|
lw t3,44(a1)
|
|
lw t4,48(a1)
|
|
lw t5,52(a1)
|
|
lw t6,56(a1)
|
|
lw t7,60(a1)
|
|
pref 0, 160(a1) # bring the next lines of src, addr 160
|
|
|
|
|
|
sw t0,32(a0)
|
|
sw t1,36(a0)
|
|
sw t2,40(a0)
|
|
sw t3,44(a0)
|
|
sw t4,48(a0)
|
|
sw t5,52(a0)
|
|
sw t6,56(a0)
|
|
sw t7,60(a0)
|
|
|
|
|
|
addiu a0,a0,64 # adding 64 to dest
|
|
sgtu v1,a0,t9 # refresh the "pref 30" guard for the next pass
|
|
bne a0,a3,.Lloop16w # loop until dst reaches the end of the 64-byte chunks
|
|
addiu a1,a1,64 # delay slot: adding 64 to src
|
|
move a2,t8 # a2 = bytes left after the 64-byte chunks
|
|
|
|
|
|
# Here we have src and dest word-aligned but less than 64-bytes to go
|
|
|
|
|
|
.Lchk8w:
|
|
pref 0, 0x0(a1)
|
|
andi t8,a2,0x1f # is there a 32-byte chunk?
|
|
# the t8 is the remainder count past 32-bytes
|
|
beq a2,t8,.Lchk1w # when a2=t8, no 32-byte chunk
|
|
nop # delay slot
|
|
|
|
|
|
lw t0,0(a1)
|
|
lw t1,4(a1)
|
|
lw t2,8(a1)
|
|
lw t3,12(a1)
|
|
lw t4,16(a1)
|
|
lw t5,20(a1)
|
|
lw t6,24(a1)
|
|
lw t7,28(a1)
|
|
addiu a1,a1,32
|
|
|
|
|
|
sw t0,0(a0)
|
|
sw t1,4(a0)
|
|
sw t2,8(a0)
|
|
sw t3,12(a0)
|
|
sw t4,16(a0)
|
|
sw t5,20(a0)
|
|
sw t6,24(a0)
|
|
sw t7,28(a0)
|
|
addiu a0,a0,32
|
|
|
|
|
|
.Lchk1w:
|
|
andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
|
|
beq a2,t8,.Llast8 # no whole word left: only trailing bytes remain
|
|
subu a3,t8,a2 # delay slot: a3 is count of bytes in 1w chunks
|
|
addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
|
|
|
|
|
|
# copying in words (4-byte chunks)
|
|
.LwordCopy_loop:
|
|
lw t3,0(a1) # the first t3 may be equal t0 ... optimize?
|
|
addiu a1,a1,4
|
|
addiu a0,a0,4
|
|
bne a0,a3,.LwordCopy_loop
|
|
sw t3,-4(a0) # delay slot: store to the word a0 addressed before the increment
|
|
|
|
|
|
# For the last (<8) bytes
|
|
.Llast8:
|
|
blez a2,.Lleave # nothing left to copy
|
|
addu a3,a0,a2 # delay slot: a3 is the dst address just past the last byte
|
|
.Llast8loop:
|
|
lb v1,0(a1)
|
|
addiu a1,a1,1
|
|
addiu a0,a0,1
|
|
bne a0,a3,.Llast8loop
|
|
sb v1,-1(a0) # delay slot: store to the byte a0 addressed before the increment
|
|
|
|
|
|
.Lleave:
|
|
j ra # return; v0 was set right after .Lmemcpy
|
|
nop # delay slot
|
|
|
|
|
|
#
|
|
# UNALIGNED case
|
|
#
|
|
|
|
|
|
.Lunaligned:
|
|
# got here with a3="negu a0"
|
|
andi a3,a3,0x3 # test if the a0 is word aligned
|
|
beqz a3,.Lua_chk16w
|
|
subu a2,a2,a3 # delay slot: bytes left after initial a3 bytes
|
|
|
|
|
|
# The LWHI/LWLO pair reads the four bytes at a1..a1+3 into v1; SWHI then
# writes only the bytes from a0 up to the next word boundary.
LWHI v1,0(a1)
|
|
LWLO v1,3(a1)
|
|
addu a1,a1,a3 # a3 may be here 1, 2 or 3
|
|
SWHI v1,0(a0)
|
|
addu a0,a0,a3 # below the dst will be word aligned (NOTE1)
|
|
|
|
|
|
.Lua_chk16w:
|
|
andi t8,a2,0x3f # any whole 64-byte chunks?
|
|
# t8 is the byte count after 64-byte chunks
|
|
beq a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
|
|
# There will be at most 1 32-byte chunk after it
|
|
subu a3,a2,t8 # delay slot: subtract the remainder from a2
|
|
# Here a3 counts bytes in 16w chunks
|
|
addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
|
|
|
|
|
|
addu t0,a0,a2 # t0 is the "past the end" address
|
|
|
|
|
|
subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address
|
|
|
|
|
|
pref 0,0(a1) # bring the first line of src, addr 0
|
|
pref 0,32(a1) # bring the second line of src, addr 32
|
|
pref 0,64(a1) # bring the third line of src, addr 64
|
|
pref 30,32(a0) # safe, as we have at least 64 bytes ahead
|
|
# In case the a0 > t9 don't use "pref 30" at all
|
|
sgtu v1,a0,t9 # v1 = 1 when a0 > t9 (unsigned compare)
|
|
bgtz v1,.Lua_loop16w # skip "pref 30,64(a0)" for too short arrays
|
|
nop # delay slot
|
|
# otherwise, start with using pref30
|
|
pref 30,64(a0)
|
|
# Same shape as .Lloop16w, except that every source word is assembled with an
# LWHI/LWLO pair because a1 is not word-aligned on this path.
.Lua_loop16w:
|
|
pref 0,96(a1)
|
|
LWHI t0,0(a1)
|
|
LWLO t0,3(a1)
|
|
LWHI t1,4(a1)
|
|
bgtz v1,.Lua_skip_pref30_96 # skip "pref 30,96(a0)"
|
|
LWLO t1,7(a1) # delay slot
|
|
pref 30,96(a0) # continue setting up the dest, addr 96
|
|
.Lua_skip_pref30_96:
|
|
LWHI t2,8(a1)
|
|
LWLO t2,11(a1)
|
|
LWHI t3,12(a1)
|
|
LWLO t3,15(a1)
|
|
LWHI t4,16(a1)
|
|
LWLO t4,19(a1)
|
|
LWHI t5,20(a1)
|
|
LWLO t5,23(a1)
|
|
LWHI t6,24(a1)
|
|
LWLO t6,27(a1)
|
|
LWHI t7,28(a1)
|
|
LWLO t7,31(a1)
|
|
pref 0,128(a1) # bring the next lines of src, addr 128
|
|
|
|
|
|
sw t0,0(a0)
|
|
sw t1,4(a0)
|
|
sw t2,8(a0)
|
|
sw t3,12(a0)
|
|
sw t4,16(a0)
|
|
sw t5,20(a0)
|
|
sw t6,24(a0)
|
|
sw t7,28(a0)
|
|
|
|
|
|
LWHI t0,32(a1)
|
|
LWLO t0,35(a1)
|
|
LWHI t1,36(a1)
|
|
bgtz v1,.Lua_skip_pref30_128 # skip "pref 30,128(a0)"
|
|
LWLO t1,39(a1) # delay slot
|
|
pref 30,128(a0) # continue setting up the dest, addr 128
|
|
.Lua_skip_pref30_128:
|
|
LWHI t2,40(a1)
|
|
LWLO t2,43(a1)
|
|
LWHI t3,44(a1)
|
|
LWLO t3,47(a1)
|
|
LWHI t4,48(a1)
|
|
LWLO t4,51(a1)
|
|
LWHI t5,52(a1)
|
|
LWLO t5,55(a1)
|
|
LWHI t6,56(a1)
|
|
LWLO t6,59(a1)
|
|
LWHI t7,60(a1)
|
|
LWLO t7,63(a1)
|
|
pref 0, 160(a1) # bring the next lines of src, addr 160
|
|
|
|
|
|
sw t0,32(a0)
|
|
sw t1,36(a0)
|
|
sw t2,40(a0)
|
|
sw t3,44(a0)
|
|
sw t4,48(a0)
|
|
sw t5,52(a0)
|
|
sw t6,56(a0)
|
|
sw t7,60(a0)
|
|
|
|
|
|
addiu a0,a0,64 # adding 64 to dest
|
|
sgtu v1,a0,t9 # refresh the "pref 30" guard for the next pass
|
|
bne a0,a3,.Lua_loop16w # loop until dst reaches the end of the 64-byte chunks
|
|
addiu a1,a1,64 # delay slot: adding 64 to src
|
|
move a2,t8 # a2 = bytes left after the 64-byte chunks
|
|
|
|
|
|
# Here the dst is word-aligned (the src is not) and fewer than 64 bytes remain
|
|
|
|
|
|
.Lua_chk8w:
|
|
pref 0, 0x0(a1)
|
|
andi t8,a2,0x1f # is there a 32-byte chunk?
|
|
# the t8 is the remainder count
|
|
beq a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
|
|
nop # delay slot
|
|
|
|
|
|
LWHI t0,0(a1)
|
|
LWLO t0,3(a1)
|
|
LWHI t1,4(a1)
|
|
LWLO t1,7(a1)
|
|
LWHI t2,8(a1)
|
|
LWLO t2,11(a1)
|
|
LWHI t3,12(a1)
|
|
LWLO t3,15(a1)
|
|
LWHI t4,16(a1)
|
|
LWLO t4,19(a1)
|
|
LWHI t5,20(a1)
|
|
LWLO t5,23(a1)
|
|
LWHI t6,24(a1)
|
|
LWLO t6,27(a1)
|
|
LWHI t7,28(a1)
|
|
LWLO t7,31(a1)
|
|
addiu a1,a1,32
|
|
|
|
|
|
sw t0,0(a0)
|
|
sw t1,4(a0)
|
|
sw t2,8(a0)
|
|
sw t3,12(a0)
|
|
sw t4,16(a0)
|
|
sw t5,20(a0)
|
|
sw t6,24(a0)
|
|
sw t7,28(a0)
|
|
addiu a0,a0,32
|
|
|
|
|
|
.Lua_chk1w:
|
|
andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
|
|
beq a2,t8,.Lua_smallCopy # no whole word left: only trailing bytes remain
|
|
subu a3,t8,a2 # delay slot: a3 is count of bytes in 1w chunks
|
|
addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
|
|
|
|
|
|
# copying in words (4-byte chunks)
|
|
.Lua_wordCopy_loop:
|
|
LWHI v1,0(a1)
|
|
LWLO v1,3(a1)
|
|
addiu a1,a1,4
|
|
addiu a0,a0,4 # note: dst=a0 is word aligned here, see NOTE1
|
|
bne a0,a3,.Lua_wordCopy_loop
|
|
sw v1,-4(a0) # delay slot: store to the word a0 addressed before the increment
|
|
|
|
|
|
# Now less than 4 bytes (value in a2) left to copy
|
|
.Lua_smallCopy:
|
|
beqz a2,.Lleave # nothing left to copy
|
|
addu a3,a0,a2 # delay slot: a3 is the dst address just past the last byte
|
|
.Lua_smallCopy_loop:
|
|
lb v1,0(a1)
|
|
addiu a1,a1,1
|
|
addiu a0,a0,1
|
|
bne a0,a3,.Lua_smallCopy_loop
|
|
sb v1,-1(a0) # delay slot: store to the byte a0 addressed before the increment
|
|
|
|
|
|
j ra # return; v0 was set right after .Lmemcpy
|
|
nop # delay slot
|
|
|
|
|
|
.set at
|
|
.set reorder
|
|
|
|
|
|
END(memcpy)
|
|
|
|
|
|
/************************************************************************
|
|
* Implementation : Static functions
|
|
************************************************************************/
|