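/* libc/newlib/libc/machine/sh3eb/strncpy.S
   File-viewer header: 210 lines, 4.8 KiB, listed as "ArmAsm".
   Note: the code below is SuperH SH-5 (SHmedia / SHcompact) assembly that is
   run through the C preprocessor; it is not ARM assembly.  */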

/* Copyright 2003 SuperH Ltd. */
#include "asm.h"
#ifdef __SH5__
#if __SHMEDIA__
/* ZPAD_MASK (src, dst)
   src is the result of an mcmpeq.b of a data quadword against r63, i.e. it
   flags the zero bytes of that quadword (it is 0 when there are none).
   dst receives a mask which, ANDed with the data quadword, keeps the bytes
   in front of the first zero byte and clears the bytes after it; the callers
   in this file use it to "mask out non-zero bytes after first zero byte".
   With src == 0 the mask is all ones, so nothing is cleared.
   The mask is built by subtracting 1, which works upwards from the least
   significant byte.  On big-endian the first byte in memory is the most
   significant one, so src is byte-reversed before the subtraction and the
   result is byte-reversed back.  */
#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif
/* We assume that the destination is not in the first 16 bytes of memory.
A typical linker script will put the text section first, and as
this code is longer than 16 bytes, you have to go out of your way
to put data there. */
/* char *strncpy (char *dst, const char *src, size_t n) -- SHmedia variant.

   Copies the string at the read address to the store address, working a
   quadword (8 bytes) at a time, and zero-pads the destination up to the
   store end address once a zero byte has been copied.

   Register roles (collected from the comments and code below):
     r2   store (destination) address; never written in this routine, so
          it still holds the original value at every exit
     r3   read (source) address
     r4   size
     r20  store end address (r2 + r4)
     r5   store end address minus 8
     r36  store end address minus 16 (only set on the size > 8 path)
     r22  running store address; at L_found0 it is the store address plus 8
          of the quad held in r0
     r6   r3 - r2, so that r22 + r6 is the read address matching r22
     r0   source quadword currently being processed
     r1   mcmpeq.b result for r0 (nonzero when r0 contains a zero byte)
     r63  used as the all-zero operand and as the discarded blink link
     r18  address that every exit branches to (ptabs r18 / blink)

   SHHI and SHLO are not defined in this file; presumably they come from
   asm.h and are endian-dependent shifts towards higher / lower addresses.
   NOTE(review): confirm against asm.h.  */
ENTRY(strncpy)
pt L_small, tr2
/* r0 = first (possibly partial) source quadword.  */
ldlo.q r3, 0, r0
/* r19 = read address * 8; used just below as a shift count (presumably only
   its low bits matter, i.e. the bit offset of r3 inside its quadword).  */
shlli r3, 3, r19
/* r1 = per-byte comparison of r0 against zero (r63).  */
mcmpeq.b r0, r63, r1
/* r7 = r1 shifted by r19.  Per the notes at L_small, nonzero r7 means a
   relevant zero byte was found in r0.  */
SHHI r1, r19, r7
/* r20 = store end address, r5 = store end address minus 8.  */
add r2, r4, r20
addi r20, -8, r5
/* If the size is greater than 8, we know we can read beyond the first
(possibly partial) quadword, and write out a full first and last
(possibly unaligned and/or overlapping) quadword. */
/* Otherwise (r2 >= r5) the short-copy code at L_small is used.  */
bge/u r2, r5, tr2 // L_small
pt L_found0, tr0
/* r22 = store address plus 8, which is what L_found0 expects for r0.  */
addi r2, 8, r22
bnei/u r7, 0, tr0 // L_found0
/* r38 = (r3 & 7) - 8: minus the number of bytes from r3 up to the next
   quadword boundary.  */
ori r3, -8, r38
pt L_end_early, tr1
/* r22 = store address advanced by that many bytes, so the matching read
   address r22 + r6 is quadword aligned.  */
sub r2, r38, r22
/* Write r0 as a full first quadword at r2 (stlo.q/sthi.q pair, so r2 need
   not be aligned).  */
stlo.q r2, 0, r0
sthi.q r2, 7, r0
/* r6 = read address minus store address.  */
sub r3, r2, r6
/* r0 = source quadword matching store address r22.  */
ldx.q r22, r6, r0
/* Before each iteration, check that we can store in full the next quad we
are about to fetch. */
/* r36 = store end address minus 16.  */
addi r5, -8, r36
bgtu/u r22, r36, tr1 // L_end_early
pt L_scan0, tr1
L_scan0:
/* Main loop.  On entry r0 holds the quad that belongs at store address r22.
   Each pass advances r22, stores r0 at r22 - 8, leaves for L_found0 if r0
   contains a zero byte, and otherwise fetches the next quad; it repeats
   while r36 >= r22.  */
addi r22, 8, r22
mcmpeq.b r0, r63, r1
stlo.q r22, -8, r0
bnei/u r1, 0, tr0 // L_found0
sthi.q r22, -1, r0
ldx.q r22, r6, r0
bgeu/l r36, r22, tr1 // L_scan0
L_end:
// At end; we might re-read a few bytes when we fetch the last quad.
// branch mispredict, so load is ready now.
mcmpeq.b r0, r63, r1
addi r22, 8, r22
bnei/u r1, 0, tr0 // L_found0
/* No zero byte in r0.  r7 = read end address (r3 + r4); the ldlo.q/ldhi.q
   pair fetches the two parts of the last 8 source bytes, which are ORed
   together into r1 further down.  */
add r3, r4, r7
ldlo.q r7, -8, r1
ldhi.q r7, -1, r7
/* tr0 = address held in r18, for the blink that leaves the routine.  */
ptabs r18, tr0
/* Store the current quad r0 in full at r22 - 8 (the sthi.q half follows
   after the next two instructions).  */
stlo.q r22, -8, r0
or r1, r7, r1
/* r7 = zero-byte flags of the last source quad, then turned into a mask.  */
mcmpeq.b r1, r63, r7
sthi.q r22, -1, r0
ZPAD_MASK (r7, r7)
and r1, r7, r1 // mask out non-zero bytes after first zero byte
/* Write the last quad so that it ends exactly at the store end address.  */
stlo.q r20, -8, r1
sthi.q r20, -1, r1
blink tr0, r63
L_end_early:
/* Check if we can store the current quad in full. */
pt L_end, tr1
/* r7 = read end address (r3 + r4).  */
add r3, r4, r7
/* Taken when r5 > r22, i.e. the current quad ends before the store end.  */
bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.
/* If not, that means we can just proceed to process the last quad.
Two pipeline stalls are unavoidable, as we don't have enough ILP. */
/* Same last-quad sequence as at the end of L_end, without storing r0.  */
ldlo.q r7, -8, r1
ldhi.q r7, -1, r7
ptabs r18, tr0
or r1, r7, r1
mcmpeq.b r1, r63, r7
ZPAD_MASK (r7, r7)
and r1, r7, r1 // mask out non-zero bytes after first zero byte
stlo.q r20, -8, r1
sthi.q r20, -1, r1
blink tr0, r63
L_found0:
// r0: string to store, not yet zero-padding normalized.
// r1: result of mcmpeq.b r0, r63, r1.
// r22: store address plus 8. I.e. address where zero padding beyond the
// string in r0 goes.
// r20: store end address.
// r5: store end address minus 8.
pt L_write0_multiquad, tr0
ZPAD_MASK (r1, r1)
and r0, r1, r0 // mask out non-zero bytes after first zero byte
/* Store the normalized quad at r22 - 8.  */
stlo.q r22, -8, r0
sthi.q r22, -1, r0
andi r22, -8, r1 // Check if zeros to write fit in one quad word.
/* Taken when r5 > (r22 & -8): the padding spans more than one quad.  */
bgtu/l r5, r1, tr0 // L_write0_multiquad
ptabs r18, tr1
/* r1 = number of bytes between r22 and the store end address, times 4.
   Applying that shift twice moves r0 by 8 bits per remaining byte.  */
sub r20, r22, r1
shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
SHLO r0, r1, r0 // handled correctly.
SHLO r0, r1, r0
/* Write the part of the shifted quad that ends at the last destination
   byte (r20 - 1).  */
sthi.q r20, -1, r0
blink tr1, r63
L_write0_multiquad:
/* Zero padding over several quads, always storing r63:
   stlo.q covers r22 up to the end of its quadword, sthi.q covers the start
   of the final quadword up to the last destination byte, and r1 (r22 & -8
   on entry) then steps through the aligned quads in between.  */
pt L_write0_loop, tr0
ptabs r18, tr1
stlo.q r22, 0, r63
sthi.q r20, -1, r63
addi r1, 8, r1
/* Enter the loop only while r5 >= r1, i.e. a whole quad at r1 still ends
   at or before the store end address.  */
bgeu/l r5, r1, tr0 // L_write0_loop
blink tr1, r63
L_write0_loop:
st.q r1, 0 ,r63
addi r1, 8, r1
bgeu/l r5, r1, tr0 // L_write0_loop
blink tr1, r63
L_small:
// r0: string to store, not yet zero-padding normalized.
// r1: result of mcmpeq.b r0, r63, r1.
// r7: nonzero indicates relevant zero found in r0.
// r2: store address.
// r3: read address.
// r4: size, max 8
// r20: store end address.
// r5: store end address minus 8.
pt L_nohi, tr0
pt L_small_storelong, tr1
ptabs r18, tr2
/* r23 = -size.  */
sub r63, r4, r23
/* A zero byte was already found in the part loaded so far: no need to
   read the rest of the source quad.  */
bnei/u r7, 0, tr0 // L_nohi
/* r7 = (r3 & 7) - 8: minus the number of bytes from r3 up to the next
   quadword boundary.  */
ori r3, -8, r7
/* -size >= r7 means the requested bytes all lie before that boundary.  */
bge/l r23, r7, tr0 // L_nohi
/* Otherwise fetch the other part of the unaligned quad at r3, merge it into
   r0 and recompute the zero-byte flags.  */
ldhi.q r3, 7, r1
or r0, r1, r0
mcmpeq.b r0, r63, r1
L_nohi:
ZPAD_MASK (r1, r1)
and r0, r1, r0
/* Sizes of 4 and above are written with two 32-bit stores.  */
movi 4, r19
bge/u r4, r19, tr1 // L_small_storelong
pt L_small_end, tr0
/* Sizes 0..3: store byte by byte.  st.b writes the low byte of r0 and each
   shlri brings the next one down, so on big-endian r0 is byte-reversed
   first to get the bytes out in address order.  */
#ifndef __LITTLE_ENDIAN__
byterev r0, r0
#endif
beqi/u r4, 0, tr0 // L_small_end
st.b r2, 0, r0
beqi/u r4, 1, tr0 // L_small_end
shlri r0, 8, r0
st.b r2, 1, r0
beqi/u r4, 2, tr0 // L_small_end
shlri r0, 8, r0
st.b r2, 2, r0
L_small_end:
blink tr2, r63
L_small_storelong:
/* r7 = -size * 8, used as the shift count for r1.  Together with the
   endian-specific 32-bit shift below this appears to leave the first four
   bytes in the low half of r0 and the last four bytes of the size-byte
   result in the low half of r1.
   NOTE(review): depends on the SHHI definition in asm.h.  */
shlli r23, 3, r7
SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
shlri r1, 32, r1
#else
shlri r0, 32, r0
#endif
/* Unaligned 32-bit store of r0 at the store address, and of r1 ending at
   the store end address; for sizes below 8 the two stores overlap.  */
stlo.l r2, 0, r0
sthi.l r2, 3, r0
stlo.l r20, -4, r1
sthi.l r20, -1, r1
blink tr2, r63
#else /* SHcompact */
/* This code is optimized for size. Instruction selection is SH5 specific.
SH4 should use a different version. */
/* char *strncpy (char *dst, const char *src, size_t n) -- SHcompact variant.

   Register use, as read from the code below:
     r2  dst; never written, so it still holds dst at the rts
     r3  src read pointer, post-incremented by the load
     r4  n on entry; afterwards dst + n - 1, the last byte to write
     r5  store pointer; starts at dst - 1 and is advanced before each store
     r6  constant 0
     r1  byte being copied

   The T bit carries "the byte just stored was 0" from the bottom of the
   loop to its top.  Once it is set the load is skipped, r1 stays 0, and the
   remaining bytes up to r4 are written as zeros.  */
ENTRY(strncpy)
/* r6 = 0; T = (n == 0).  */
mov #0, r6
cmp/eq r4, r6
/* n == 0: nothing to store.  */
bt return
/* r5 = dst - 1, r4 = dst + n - 1.  */
mov r2, r5
add #-1, r5
add r5, r4
loop:
/* T is clear on first entry (the bt above fell through).  When T is set,
   skip the load.  The add is in the delay slot of bt/s, so r5 advances on
   both paths.  */
bt/s found0
add #1, r5
/* r1 = next source byte.  */
mov.b @r3+, r1
found0:
/* T = (r5 is the last byte to write); tested by the bf/s after the store.  */
cmp/eq r5,r4
mov.b r1, @r5
/* Loop again unless that was the last byte.  The compare in the delay slot
   sets T = (r1 == 0) for the bt/s at the top of the loop.  */
bf/s loop
cmp/eq r1, r6
return:
/* The nop fills the rts delay slot.  */
rts
nop
#endif /* SHcompact */
#endif /* __SH5__ */