render-cg: restore bopti method on P4 and define p4_clearbg_alt

This commit is contained in:
Lephe 2022-05-06 16:26:44 +01:00
parent a4df076214
commit ede19fc878
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
6 changed files with 241 additions and 113 deletions

View File

@ -181,6 +181,7 @@ set(SOURCES_CG
src/render-cg/image/image_p4.S
src/render-cg/image/image_p4_normal.S
src/render-cg/image/image_p4_clearbg.S
src/render-cg/image/image_p4_clearbg_alt.S
src/render-cg/image/image_p4_swapcolor.S
src/render-cg/image/image_p4_dye.S
# Interface to the fast image renderer
@ -193,6 +194,7 @@ set(SOURCES_CG
src/render-cg/image/image_p8_swapcolor.c
src/render-cg/image/image_p8_dye.c
src/render-cg/image/image_p4.c
src/render-cg/image/image_p4_clearbg_alt.c
src/render-cg/image/image_p4_effect.c
src/render-cg/image/image_p4_swapcolor.c
src/render-cg/image/image_p4_dye.c

View File

@ -197,6 +197,12 @@ DIMAGE_SIG(_addbg, int effects, int bg_color)
/* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */
DIMAGE_SIG(_dye, int effects, int dye_color)
/* d[sub]image_p4_clearbg_alt(..., effects, bg_index)
This is functionally identical to CLEARBG, but it uses an alternative
rendering method that is faster for larger images with wide transparent
areas. You can swap it with the normal CLEARBG freely. */
DIMAGE_SIG1(p4_clearbg_alt, int effects, int bg_index)
#define dimage_rgb16_effect(x, y, img, eff, ...) \
dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
@ -353,6 +359,7 @@ void gint_image_p8_dye(void);
void gint_image_p4_normal(void);
void gint_image_p4_clearbg(void);
void gint_image_p4_clearbg_alt(void);
void gint_image_p4_swapcolor(void);
void gint_image_p4_dye(void);

View File

@ -33,7 +33,7 @@ void dsubimage_p4_clearbg(int x, int y, image_t const *img,
struct gint_image_box box = { x, y, w, h, left, top };
struct gint_image_cmd cmd;
if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH,
if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
DHEIGHT)) return;
cmd.effect += 4;
cmd.color_1 = bg_color;

View File

@ -1,146 +1,90 @@
.global _gint_image_p4_clearbg
#include "image_macros.S"
/* P4 CLEARBG, RAM version: by NULL canceling.
/* P4 CLEARBG, RAM version: trivial.
This function is similar to P8 CLEARBG. Transparent pixels are not limited
by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an
explanation of NULL canceling.
This is the bopti algorithm. Azur's is faster when there are enough
transparent pixels, but very limiting for quasi-opaque images.
r0: [temporary]
r7: Right edge pointer
r7: Current x position
r8: Alpha value
r9: Palette
r10: Left edge pointer
r11: Nullable output pointer
r12: 0 (in outer loop: edge stride)
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-12,r15): Right edge value
@(-8,r15): Left edge value
@(-4,r15): Edge stride */
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2
nop
add r10, r10
nop
r10: Initial x position
r11: Column counter
r12: -3 */
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR
/* Cancel the last operation to keep r4 = bytes between rows */
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov r2, r7
shlr r7
addc r7, r4
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0
mov.l r12, @-r15
shll r7
mov.l r11, @-r15
add r5, r7
mov r0, r12
add r6, r12
mov.l r13, @-r15
add r5, r10
mov.l r14, @-r15
add #-4, r5
nop
mov.w @r8, r8 /* cmd.color_1 */
add #-1, r4 /* Input stride compensation for pipelining */
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
mov.l r11, @-r15
shll r8 /* alpha*2 compares against palette offsets */
nop
START
mov.b @r3+, \TMP1
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
shll \TMP1
mov.w @r10, r0 /* Save left edge */
nop
mov.l r0, @-r15
nop
mov.l r12, @-r15
mov #0, r12
mov #-3, r12
2: mov \TMP1, r0
.if \HFLIP
add #-2, r5
mov r2, r0
shll r0
add r0, r5
shll r0
add r0, r6
.endif
1: mov r2, r11
mov r10, r7
/* Load 4 bits from offset r7 (in pixels) within input */
2: mov r7, r0
shlr r0
mov.b @(r0, r3), r0
nop
bt.s 3f
add #1, r7
/* Aligned */
shld r12, r0
and #0x1e, r0
cmp/eq r0, r8
mov #-1, r11
bt 4f
addc r12, r11
mov #-4, \TMP2
mov.w @(r0, r9), r0
bra 4f
mov.w r0, @r5
and r5, r11
mov.w @(r0,r9), r0
/* Unaligned */
3: shll r0
and #0x1e, r0
shld \TMP2, \TMP1
mov #0x1e, \TMP2
cmp/eq r0, r8
bt 4f
and \TMP2, \TMP1
mov.w r0, @(\OFF1,r11)
cmp/eq \TMP1, r8
mov #-1, r11
addc r12, r11
mov \TMP1, r0
and r5, r11
mov.b @r3+, \TMP1
mov.w @(r0, r9), r0
mov.w r0, @r5
/* End */
4: dt r11
bf.s 2b
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
mov.w r0, @(\OFF2,r11)
3: shll \TMP1
mov.l @r15+, r12
nop
mov.l @r15+, r0
nop
mov.w r0, @r10 /* Restore left edge */
add r12, r10
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r11
mov.l @r15+, r10
EPILOGUE
.endm
@ -149,5 +93,5 @@ _gint_image_p4_clearbg:
tst #1, r0
bf 9f
GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2
GEN_CLEARBG_LOOP 0, 2
9: GEN_CLEARBG_LOOP 1, -2

View File

@ -0,0 +1,153 @@
.global _gint_image_p4_clearbg_alt
#include "image_macros.S"
/* P4 CLEARBG, alternative version: by NULL canceling.
This function is similar to P8 CLEARBG. Transparent pixels are not limited
by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an
explanation of NULL canceling.

Summary of the technique as implemented below: every pixel is stored
unconditionally, but the store address is (r5 AND mask) + OFFn, where mask
is 0 when the pixel's palette offset equals the alpha value and -1
otherwise. Opaque pixels therefore land in the output, while transparent
ones are stored at the bare address OFFn. NOTE(review): this relies on
stores to those low addresses being harmless on the target -- confirm with
the P8 CLEARBG notes.

Each input byte packs two 4-bit pixels. One loop iteration consumes one
byte: the low nibble is stored at displacement OFF1 and the high nibble at
displacement OFF2.

Macro parameters:
HFLIP: when non-zero, assembles the extra adjustment of r5 and r6 below
OUT_DIR: immediate added to r5 once per iteration (4 or -4 in this file)
TMP1, TMP2: scratch registers (instantiated with r13 and r14 in this file)
OFF1, OFF2: store displacements for the low and high nibble respectively

Registers read on entry (they are set up outside this file; the roles below
are inferred from how the macro uses them -- TODO confirm against
image_macros.S and the P4 dispatch code):
r2: halved on entry; presumably the width in pixels
r3: input pointer (read with post-increment)
r4: input stride
r5: output pointer
r6: added to 4*r2 to form the edge stride; presumably the output stride
r8: pointer to cmd.palette, followed by cmd.edge_2 and cmd.color_1
r10: doubled and added to r5; presumably the left edge position in pixels

Register use inside the macro:
r0: [temporary]
r7: Right edge pointer
r8: Alpha value
r9: Palette
r10: Left edge pointer
r11: Nullable output pointer
r12: 0 (in outer loop: edge stride)
r13: [temporary]
r14: [temporary]
Spilled to stack for the duration of each row, in push order (offsets are
relative to r15 before the three pushes):
@(-4,r15): Right edge value
@(-8,r15): Left edge value
@(-12,r15): Edge stride */
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
/* r2 >>= 1 and r10 *= 2 (r10 becomes a byte offset for 16-bit pixels) */
shlr r2
nop
add r10, r10
nop
/* Read the three command fields through r8 while saving r11..r14 */
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
/* r0 = 4 * r2: output bytes covered by r2 pairs of 16-bit pixels */
shll2 r0
mov.l r12, @-r15
/* r7 = r5 + 2 * cmd.edge_2: address of the right edge pixel */
shll r7
mov.l r11, @-r15
add r5, r7
/* r12 = 4 * r2 + r6: distance added to both edge pointers after each row.
   It is computed before the HFLIP adjustment of r6 below. */
mov r0, r12
add r6, r12
mov.l r13, @-r15
/* r10 = r5 + 2 * r10: address of the left edge pixel */
add r5, r10
mov.l r14, @-r15
/* Bias the output pointer by -4; the OFF1/OFF2 displacements passed by
   the instantiations account for this bias */
add #-4, r5
mov.w @r8, r8 /* cmd.color_1 */
/* Each row reads one byte more than the loop has iterations (one preload
   before the loop plus one load per iteration), hence the -1 */
add #-1, r4 /* Input stride compensation for pipelining */
.if \HFLIP
/* Flipped variant: move r5 forward by 4 * r2 and add 8 * r2 to r6, since
   r5 is stepped by a negative OUT_DIR inside the loop */
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
shll r8 /* alpha*2 compares against palette offsets */
nop
/* START comes from image_macros.S (not visible here); presumably it opens
   the per-row loop and sets up the repeat of the 2: .. 3: section */
START
/* Preload the first input byte of the row */
mov.b @r3+, \TMP1
nop
/* Save both edge pixels of the output and the edge stride on the stack.
   The edge pixels are written back after the row, so whatever the loop
   stored at these two addresses is discarded. */
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
/* TMP1 = byte * 2, so that each nibble becomes a 2-byte palette offset */
shll \TMP1
mov.w @r10, r0 /* Save left edge */
nop
mov.l r0, @-r15
nop
mov.l r12, @-r15
/* r12 = 0 so that the addc instructions below add only the T bit */
mov #0, r12
/* Low nibble: r0 = palette offset of the first pixel of the byte */
2: mov \TMP1, r0
and #0x1e, r0
/* T = 1 if the pixel is transparent; r11 = -1 + 0 + T, i.e. 0 when
   transparent and -1 when opaque */
cmp/eq r0, r8
mov #-1, r11
addc r12, r11
mov #-4, \TMP2
/* r11 = r5 when opaque, 0 when transparent */
and r5, r11
/* r0 = 16-bit palette entry for this pixel */
mov.w @(r0,r9), r0
/* High nibble: TMP1 = ((byte * 2) >> 4) & 0x1e */
shld \TMP2, \TMP1
mov #0x1e, \TMP2
and \TMP2, \TMP1
/* Store the low-nibble pixel at r11 + OFF1 */
mov.w r0, @(\OFF1,r11)
/* Same mask computation for the high-nibble pixel */
cmp/eq \TMP1, r8
mov #-1, r11
addc r12, r11
mov \TMP1, r0
and r5, r11
/* Preload the next input byte, then step the output pointer; r11 was
   derived from r5 before this step */
mov.b @r3+, \TMP1
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
/* Store the high-nibble pixel at r11 + OFF2 */
mov.w r0, @(\OFF2,r11)
/* Pre-shift the preloaded byte for the next iteration */
3: shll \TMP1
/* Row done: recover the edge stride, write back the saved edge pixels and
   advance both edge pointers by the edge stride */
mov.l @r15+, r12
nop
mov.l @r15+, r0
nop
mov.w r0, @r10 /* Restore left edge */
add r12, r10
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
/* END comes from image_macros.S (not visible here); presumably it closes
   the per-row loop */
END
/* Restore the registers pushed at the top of the macro, in reverse order.
   r10 is popped here but not pushed by this macro; presumably it was saved
   by the code that dispatches to this function -- TODO confirm. */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point of the alternative P4 CLEARBG loop.
   Selects one of two expansions of GEN_CLEARBG_LOOP based on bit 0 of r0.
   NOTE(review): r0 presumably holds cmd.effect with bit 0 meaning HFLIP;
   this is set up outside this file -- confirm against the dispatch code.
   Each expansion ends with EPILOGUE, which presumably returns to the
   caller, so the first expansion is not expected to fall into the second. */
_gint_image_p4_clearbg_alt:
/* T = 1 when bit 0 of r0 is clear; branch to 9: when it is set */
tst #1, r0
bf 9f
/* Bit 0 clear: HFLIP=0, output pointer steps by +4, displacements 6 and 4 */
GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4
/* Bit 0 set: HFLIP=1, output pointer steps by -4, displacements 0 and 2 */
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2

View File

@ -0,0 +1,22 @@
#include <gint/image.h>
#include <gint/display.h>
/* dimage_p4_clearbg_alt(): Render a whole P4 image with the alternative
   CLEARBG method.
   Forwards to dsubimage_p4_clearbg_alt() with a source region that starts
   at (0, 0) and spans the image's full width and height.
   @x @y  Forwarded unchanged as the destination position
   @img   Image to render; it is dereferenced, so it must not be NULL
   @eff   Forwarded unchanged as the effects argument
   @bg    Forwarded unchanged as the background color argument */
void dimage_p4_clearbg_alt(int x, int y, image_t const *img, int eff, int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_clearbg_alt(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
/* dsubimage_p4_clearbg_alt(): Render a region of a P4 image with the
   alternative CLEARBG method.
   Builds a rendering command with gint_image_mkcmd() for a DWIDTH x DHEIGHT
   target; if that call reports failure, nothing is drawn. Otherwise the
   command is completed with the background color and the
   gint_image_p4_clearbg_alt loop, then handed to gint_image_p4_loop().
   @x @y        Destination position
   @img         Source image
   @left @top   Top-left corner of the source region
   @w @h        Size of the source region
   @eff         Effects, passed to gint_image_mkcmd()
   @bg_color    Stored in cmd.color_1 */
void dsubimage_p4_clearbg_alt(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd cmd;
	struct gint_image_box box = { x, y, w, h, left, top };

	/* NOTE(review): the two `true` flags presumably request the edge
	   information consumed by the assembly loop -- confirm against the
	   gint_image_mkcmd() prototype. */
	if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH,
		DHEIGHT))
	{
		cmd.loop = gint_image_p4_clearbg_alt;
		cmd.color_1 = bg_color;
		/* NOTE(review): 16 is presumably this method's offset in the
		   effect numbering -- confirm against the P4 dispatch code. */
		cmd.effect += 16;

		gint_image_p4_loop(DWIDTH, &cmd);
	}
}