diff --git a/CMakeLists.txt b/CMakeLists.txt index d962ed5..23efdfe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,7 @@ set(SOURCES_CG src/render-cg/image/image_p4.S src/render-cg/image/image_p4_normal.S src/render-cg/image/image_p4_clearbg.S + src/render-cg/image/image_p4_clearbg_alt.S src/render-cg/image/image_p4_swapcolor.S src/render-cg/image/image_p4_dye.S # Interface to the fast image renderer @@ -193,6 +194,7 @@ set(SOURCES_CG src/render-cg/image/image_p8_swapcolor.c src/render-cg/image/image_p8_dye.c src/render-cg/image/image_p4.c + src/render-cg/image/image_p4_clearbg_alt.c src/render-cg/image/image_p4_effect.c src/render-cg/image/image_p4_swapcolor.c src/render-cg/image/image_p4_dye.c diff --git a/include/gint/image.h b/include/gint/image.h index b54dd75..ce4262b 100644 --- a/include/gint/image.h +++ b/include/gint/image.h @@ -197,6 +197,12 @@ DIMAGE_SIG(_addbg, int effects, int bg_color) /* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */ DIMAGE_SIG(_dye, int effects, int dye_color) +/* d[sub]image_p4_clearbg_alt(..., effects, bg_index) + This is functionally identical to CLEARBG, but it uses an alternative + rendering method that is faster for larger images with wide transparent + areas. You can swap it with the normal CLEARBG freely. */ +DIMAGE_SIG1(p4_clearbg_alt, int effects, int bg_index) + #define dimage_rgb16_effect(x, y, img, eff, ...) 
\ dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ eff, ##__VA_ARGS__) @@ -353,6 +359,7 @@ void gint_image_p8_dye(void); void gint_image_p4_normal(void); void gint_image_p4_clearbg(void); +void gint_image_p4_clearbg_alt(void); void gint_image_p4_swapcolor(void); void gint_image_p4_dye(void); diff --git a/src/render-cg/image/image_p4.c b/src/render-cg/image/image_p4.c index b81f31a..7bee888 100644 --- a/src/render-cg/image/image_p4.c +++ b/src/render-cg/image/image_p4.c @@ -33,7 +33,7 @@ void dsubimage_p4_clearbg(int x, int y, image_t const *img, struct gint_image_box box = { x, y, w, h, left, top }; struct gint_image_cmd cmd; - if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, DHEIGHT)) return; cmd.effect += 4; cmd.color_1 = bg_color; diff --git a/src/render-cg/image/image_p4_clearbg.S b/src/render-cg/image/image_p4_clearbg.S index 7fd1f54..873222b 100644 --- a/src/render-cg/image/image_p4_clearbg.S +++ b/src/render-cg/image/image_p4_clearbg.S @@ -1,146 +1,90 @@ .global _gint_image_p4_clearbg #include "image_macros.S" -/* P4 CLEARBG, RAM version: by NULL canceling. +/* P4 CLEARBG, RAM version: trivial. - This function is similar to P8 CLEARBG. Transparent pixels are not limited - by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an - explanation of NULL canceling. + This is the bopti algorithm. Azur's is faster when there are enough + transparent pixels, but very limiting for quasi-opaque images. 
r0: [temporary] - r7: Right edge pointer + r7: Current x position r8: Alpha value r9: Palette - r10: Left edge pointer - r11: Nullable output pointer - r12: 0 (in outer loop: edge stride) - r13: [temporary] - r14: [temporary] - - Spilled to stack: - @(-12,r15): Right edge value - @(-8,r15): Left edge value - @(-4,r15): Edge stride */ - -.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 - shlr r2 - nop - - add r10, r10 - nop + r10: Initial x position + r11: Column counter + r12: -3 */ +.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR + /* Cancel the last operation to keep r4 = bytes between rows */ mov.l @r8+, r9 /* cmd.palette */ - mov r2, r0 + mov r2, r7 + shlr r7 + addc r7, r4 mov.w @r8+, r7 /* cmd.edge_2 */ - shll2 r0 - - mov.l r12, @-r15 - shll r7 - - mov.l r11, @-r15 - add r5, r7 - - mov r0, r12 - add r6, r12 - - mov.l r13, @-r15 - add r5, r10 - - mov.l r14, @-r15 - add #-4, r5 + nop mov.w @r8, r8 /* cmd.color_1 */ - add #-1, r4 /* Input stride compensation for pipelining */ - - .if \HFLIP - add r0, r5 nop - shll r0 - nop - - add r0, r6 - nop - .endif - + mov.l r11, @-r15 shll r8 /* alpha*2 compares against palette offsets */ - nop - - START - - mov.b @r3+, \TMP1 - nop - - mov.w @r7, r0 /* Save right edge */ - nop - - mov.l r0, @-r15 - shll \TMP1 - - mov.w @r10, r0 /* Save left edge */ - nop - - mov.l r0, @-r15 - nop mov.l r12, @-r15 - mov #0, r12 + mov #-3, r12 -2: mov \TMP1, r0 + .if \HFLIP + add #-2, r5 + mov r2, r0 + shll r0 + add r0, r5 + shll r0 + add r0, r6 + .endif + +1: mov r2, r11 + mov r10, r7 + + /* Load 4 bits from offset r7 (in pixels) within input */ +2: mov r7, r0 + shlr r0 + + mov.b @(r0, r3), r0 + nop + + bt.s 3f + add #1, r7 + +/* Aligned */ + shld r12, r0 and #0x1e, r0 cmp/eq r0, r8 - mov #-1, r11 + bt 4f - addc r12, r11 - mov #-4, \TMP2 + mov.w @(r0, r9), r0 + bra 4f + mov.w r0, @r5 - and r5, r11 - mov.w @(r0,r9), r0 +/* Unaligned */ +3: shll r0 + and #0x1e, r0 - shld \TMP2, \TMP1 - mov #0x1e, \TMP2 + cmp/eq r0, r8 + bt 4f - and \TMP2, 
\TMP1 - mov.w r0, @(\OFF1,r11) - - cmp/eq \TMP1, r8 - mov #-1, r11 - - addc r12, r11 - mov \TMP1, r0 - - and r5, r11 - mov.b @r3+, \TMP1 + mov.w @(r0, r9), r0 + mov.w r0, @r5 +/* End */ +4: dt r11 + bf.s 2b add #\OUT_DIR, r5 - mov.w @(r0,r9), r0 - - mov.w r0, @(\OFF2,r11) -3: shll \TMP1 - - mov.l @r15+, r12 - nop - - mov.l @r15+, r0 - nop - - mov.w r0, @r10 /* Restore left edge */ - add r12, r10 - - mov.l @r15+, r0 - nop - - mov.w r0, @r7 /* Restore right edge */ - add r12, r7 END - mov.l @r15+, r14 - mov.l @r15+, r13 - mov.l @r15+, r11 mov.l @r15+, r12 + mov.l @r15+, r11 mov.l @r15+, r10 EPILOGUE .endm @@ -149,5 +93,5 @@ _gint_image_p4_clearbg: tst #1, r0 bf 9f - GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4 -9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2 + GEN_CLEARBG_LOOP 0, 2 +9: GEN_CLEARBG_LOOP 1, -2 diff --git a/src/render-cg/image/image_p4_clearbg_alt.S b/src/render-cg/image/image_p4_clearbg_alt.S new file mode 100644 index 0000000..660c49c --- /dev/null +++ b/src/render-cg/image/image_p4_clearbg_alt.S @@ -0,0 +1,153 @@ +.global _gint_image_p4_clearbg_alt +#include "image_macros.S" + +/* P4 CLEARBG, alternative version: by NULL canceling. + + This function is similar to P8 CLEARBG. Transparent pixels are not limited + by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an + explanation of NULL canceling. 
+ + r0: [temporary] + r7: Right edge pointer + r8: Alpha value + r9: Palette + r10: Left edge pointer + r11: Nullable output pointer + r12: 0 (in outer loop: edge stride) + r13: [temporary] + r14: [temporary] + + Spilled to stack: + @(-12,r15): Right edge value + @(-8,r15): Left edge value + @(-4,r15): Edge stride */ + +.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + shlr r2 + nop + + add r10, r10 + nop + + mov.l @r8+, r9 /* cmd.palette */ + mov r2, r0 + + mov.w @r8+, r7 /* cmd.edge_2 */ + shll2 r0 + + mov.l r12, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add r5, r10 + + mov.l r14, @-r15 + add #-4, r5 + + mov.w @r8, r8 /* cmd.color_1 */ + add #-1, r4 /* Input stride compensation for pipelining */ + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + shll r8 /* alpha*2 compares against palette offsets */ + nop + + START + + mov.b @r3+, \TMP1 + nop + + mov.w @r7, r0 /* Save right edge */ + nop + + mov.l r0, @-r15 + shll \TMP1 + + mov.w @r10, r0 /* Save left edge */ + nop + + mov.l r0, @-r15 + nop + + mov.l r12, @-r15 + mov #0, r12 + +2: mov \TMP1, r0 + and #0x1e, r0 + + cmp/eq r0, r8 + mov #-1, r11 + + addc r12, r11 + mov #-4, \TMP2 + + and r5, r11 + mov.w @(r0,r9), r0 + + shld \TMP2, \TMP1 + mov #0x1e, \TMP2 + + and \TMP2, \TMP1 + mov.w r0, @(\OFF1,r11) + + cmp/eq \TMP1, r8 + mov #-1, r11 + + addc r12, r11 + mov \TMP1, r0 + + and r5, r11 + mov.b @r3+, \TMP1 + + add #\OUT_DIR, r5 + mov.w @(r0,r9), r0 + + mov.w r0, @(\OFF2,r11) +3: shll \TMP1 + + mov.l @r15+, r12 + nop + + mov.l @r15+, r0 + nop + + mov.w r0, @r10 /* Restore left edge */ + add r12, r10 + + mov.l @r15+, r0 + nop + + mov.w r0, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r12 + mov.l @r15+, r10 + EPILOGUE +.endm + +_gint_image_p4_clearbg_alt: + tst #1, r0 + bf 9f + + GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4 +9: 
GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2 diff --git a/src/render-cg/image/image_p4_clearbg_alt.c b/src/render-cg/image/image_p4_clearbg_alt.c new file mode 100644 index 0000000..fe5173b --- /dev/null +++ b/src/render-cg/image/image_p4_clearbg_alt.c @@ -0,0 +1,22 @@ +#include +#include + +void dimage_p4_clearbg_alt(int x, int y, image_t const *img, int eff, int bg) +{ + dsubimage_p4_clearbg_alt(x, y, img, 0, 0, img->width, img->height, eff, + bg); +} + +void dsubimage_p4_clearbg_alt(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 16; + cmd.color_1 = bg_color; + cmd.loop = gint_image_p4_clearbg_alt; + gint_image_p4_loop(DWIDTH, &cmd); +}