render-cg: add new image rendering functions with dynamic effects

This commit is contained in:
Lephe 2022-05-04 17:27:02 +01:00
parent 904ab74984
commit f219e5c882
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
31 changed files with 2506 additions and 47 deletions

View File

@ -169,6 +169,35 @@ set(SOURCES_CG
src/render-cg/gint_dline.c
src/render-cg/topti-asm.s
src/render-cg/topti.c
# Fast image renderer
src/render-cg/image/image.c
src/render-cg/image/image_rgb16.S
src/render-cg/image/image_rgb16_normal.S
src/render-cg/image/image_rgb16_clearbg_dye.S
src/render-cg/image/image_rgb16_swapcolor.S
src/render-cg/image/image_p8.S
src/render-cg/image/image_p8_normal.S
src/render-cg/image/image_p8_clearbg.S
src/render-cg/image/image_p8_swapcolor.S
src/render-cg/image/image_p8_dye.S
src/render-cg/image/image_p4.S
src/render-cg/image/image_p4_normal.S
src/render-cg/image/image_p4_clearbg.S
src/render-cg/image/image_p4_swapcolor.S
src/render-cg/image/image_p4_dye.S
# Interface to the fast image renderer
src/render-cg/image/image_rgb16.c
src/render-cg/image/image_rgb16_effect.c
src/render-cg/image/image_rgb16_swapcolor.c
src/render-cg/image/image_rgb16_dye.c
src/render-cg/image/image_p8.c
src/render-cg/image/image_p8_effect.c
src/render-cg/image/image_p8_swapcolor.c
src/render-cg/image/image_p8_dye.c
src/render-cg/image/image_p4.c
src/render-cg/image/image_p4_effect.c
src/render-cg/image/image_p4_swapcolor.c
src/render-cg/image/image_p4_dye.c
)
set(ASSETS_FX src/font5x7.png)

View File

@ -1,11 +1,15 @@
//---
// gint:display-cg - fxcg50 rendering functions
// gint:display-cg - fx-CG 50 rendering functions
//
// This module covers all 16-bit opaque rendering functions. For
// gamma-related functions, color composition, check out a color library.
// This module covers rendering functions specific to the fx-CG 50. In addition
// to triple-buffering management, this mainly includes image manipulation
// tools as well as the very versatile dimage_effect() and dsubimage_effect()
// functions that support high-performance image rendering with a number of
// geometric and color effects.
//
// All the functions in this module work on a 396x224 resolution - gint
// lets you use the full surface!
// The fx-CG OS restricts the display to a 384x216 rectangle roughly around the
// center, leaving margins on three sides. However, gint configures the display
// to use the full 396x224 surface!
//---
#ifndef GINT_DISPLAY_CG
@ -18,6 +22,7 @@ extern "C" {
#endif
#include <gint/defs/types.h>
#include <gint/image.h>
/* Dimensions of the VRAM */
#define DWIDTH 396
@ -57,49 +62,9 @@ enum {
green is not used). */
#define C_RGB(r,g,b) (((r) << 11) | ((g) << 6) | (b))
//---
// Image rendering (bopti)
//---
/* See <gint/image.h> for the details on image manipulation. */
typedef image_t bopti_image_t;
/* bopti_image_t: Image files encoded for bopti
This format is created by the fxSDK's [fxconv] tool from standard images. */
typedef struct
{
/* Color profile (type of palette), could be extended into a bit field
later on */
uint16_t profile;
/* Color code assigned to transparent pixels (unused in 16-bit). In
P8_RGB565A, the value assigned to alpha is always 0. */
uint16_t alpha;
/* Full width and height, in pixels */
uint16_t width;
uint16_t height;
/* Here we lose structure because of the flexible array.
RGB565, RGB565A:
* Pixels in row-major order, 16 bits per pixel
P8:
* Palette with 256 entries (512 bytes total)
* Pixels in row-major order, 8 bits per pixel
P8_RGB565A, P8_RGB565:
* Number of entries in palette, N (2 bytes)
* Palette with N entries (2N bytes)
* Pixels in row-major order, 8 bits per pixel (signed indices in
an uint16_t array starting at <palette>+<256 bytes>)
P4/P4_RGB565A, P4_RGB565:
* Palette with 16 entries (32 bytes total)
* Pixels in row-major order, 4 bits per pixel, each row
byte-padded */
uint16_t data[];
} GPACKED(4) bopti_image_t;
/* Old alias to image_t, now deprecated because of libimg */
typedef bopti_image_t image_t __attribute__((deprecated(
"image_t has been renamed to bopti_image_t")));
//---
// Video RAM management

365
include/gint/image.h Normal file
View File

@ -0,0 +1,365 @@
//---
// gint:image - Image manipulation and rendering
//
// Note: this module is currently only available on fx-CG.
//
// This header provides image manipulation functions. This mainly consists of a
// reference-based image format, various access and modification functions, and
// a number of high-performance transformations and rendering effects. If you
// find yourself limited by rendering time, note that RAM writing speed is
// often the bottleneck, and image rendering is much faster in Azur (which is
// what the renderer was initially designed for).
//
// We support 3 bit depths: full-color 16-bit (RGB565), indexed 8-bit (P8) and
// indexed 4-bit (P4). All three have an "alpha" variation where one color is
// treated as transparent, leading to 6 total formats.
//
// The image renderers support so-called *dynamic effects*, which are image
// transformations performed on-the-fly while rendering, without generating an
// intermediate image. They comprise straightforward transformations that
// achieve similar performance to straight rendering and can be combined to
// some extent, which makes them reliable whenever applicable.
//
// TODO: Switch to libimg-style image refs.
//---
#ifndef GINT_IMAGE
#define GINT_IMAGE
#ifdef __cplusplus
extern "C" {
#endif
#ifndef FXCG50
#error <gint/image.h> is only supported on FXCG50
#else
#include <gint/defs/attributes.h>
#include <gint/defs/types.h>
//---
// Image structures
//---
/* Image formats. Note that transparency really only indicates the default
rendering method, as a transparent background can always be added or removed
by a dynamic effect on any image. */
/* Values stored in image_t.profile. The numbering is not contiguous and must
   not be reordered: P4_RGB565A is 3 while P4_RGB565 is 6, and value 2 is kept
   for a deprecated P8 variant. (Presumably these values are also emitted by
   the fxconv converter - confirm before changing any of them.) */
enum {
IMAGE_RGB565 = 0, /* RGB565 without alpha */
IMAGE_RGB565A = 1, /* RGB565 with one transparent color */
IMAGE_P8_RGB565 = 4, /* 8-bit palette, all opaque colors */
IMAGE_P8_RGB565A = 5, /* 8-bit with one transparent color */
IMAGE_P4_RGB565 = 6, /* 4-bit palette, all opaque colors */
IMAGE_P4_RGB565A = 3, /* 4-bit with one transparent color */
IMAGE_DEPRECATED_P8 = 2, /* Old P8 layout (fixed 256-entry palette) */
};
/* image_t: gint's native bitmap image format
Images of this format can be created through this header's API but also by
using the fxSDK's built-in image converters with fxconv. */
typedef struct
{
/* Color format, one of the IMAGE_* values defined above. */
uint16_t profile;
/* For formats with alpha, value or index used for transparency. The
   format-specific renderers pass it on as the color/index to clear. */
uint16_t alpha;
/* Full width and height, in pixels */
uint16_t width;
uint16_t height;
/* Here we lose structure because of the flexible array.
RGB565, RGB565A:
* Pixels in row-major order, 16 bits per pixel
P8:
* Palette with 256 entries (512 bytes total)
* Pixels in row-major order, 8 bits per pixel
P8_RGB565A, P8_RGB565:
* Number of entries in palette, N (2 bytes)
* Palette with N entries (2N bytes)
* Pixels in row-major order, 8 bits per pixel (signed indices in
an uint16_t array starting at <palette>+<256 bytes>)
P4/P4_RGB565A, P4_RGB565:
* Palette with 16 entries (32 bytes total)
* Pixels in row-major order, 4 bits per pixel, each row
byte-padded
gint_image_mkcmd() relies on these offsets: for P8 the pixels start at
data + 2 + 2N bytes and the palette pointer is data + 258 bytes; for P4
the pixels start at data + 32 bytes with (width + 1) / 2 bytes per row. */
uint16_t data[];
/* GPACKED comes from <gint/defs/attributes.h>; presumably it packs the
   structure with 4-byte alignment - confirm there. */
} GPACKED(4) image_t;
/* Dynamic effects: these transformations can be applied on images while
rendering. Not all effects can be combined; unless specified otherwise:
- HFLIP and VFLIP can both be added regardless of any other effect
- At most one color effect can be applied */
/* Flags accepted in the [effects] argument of the rendering functions. Clip
   flags and color effects live in the low byte, geometric effects start at
   bit 8. */
enum {
/* Value 0x01 is reserved, because it is DIMAGE_NOCLIP, which although
part of the old API still needs to be supported. */
/* [Any]: Skip clipping the command against the source image */
IMAGE_NOCLIP_INPUT = 0x04,
/* [Any]: Skip clipping the command against the output VRAM */
IMAGE_NOCLIP_OUTPUT = 0x08,
/* [Any]: Skip clipping both */
IMAGE_NOCLIP = IMAGE_NOCLIP_INPUT | IMAGE_NOCLIP_OUTPUT,
// Geometric effects. These values should remain at exactly bit 8 and
// following, or change gint_image_mkcmd() along with it.
// (gint_image_mkcmd() shifts them right by 8 so that VFLIP and HFLIP land
// on bits 0 and 1 of gint_image_cmd.effect.)
/* [Any]: Flip image vertically */
IMAGE_VFLIP = 0x0100,
/* [Any]: Flip image horizontally */
IMAGE_HFLIP = 0x0200,
// Color effects
/* [RGB565, P8_RGB565, P4_RGB565]: Make a color transparent
Adds one argument:
* Color to clear (RGB16: 16-bit value; P8/P4: palette index) */
IMAGE_CLEARBG = 0x10,
/* [RGB565, P8_RGB565, P4_RGB565]: Turn a color into another
Adds two arguments:
* Color to replace (RGB16: 16-bit value; P8/P4: palette index)
* Replacement color (16-bit value) */
IMAGE_SWAPCOLOR = 0x20,
/* [RGB565A, P8_RGB565A, P4_RGB565A]: Add a background
Adds one argument:
* Background color (16-bit value) */
IMAGE_ADDBG = 0x40,
/* [RGB565A, P8_RGB565A, P4_RGB565A]: Dye all non-transparent pixels
Adds one argument:
* Dye color (16-bit value) */
IMAGE_DYE = 0x80,
};
//---
// Image access and information
//---
/* TODO: Expand */
int image_get_pixel(image_t const *img, int x, int y);
int image_decode_pixel(image_t const *img, int pixel);
//---
// Image rendering functions
//
// The following functions extend dimage() and dsubimage(). The [effects]
// parameter takes a combination of IMAGE_* flags and effects, limited to the
// combinations previously described, with additional arguments depending on
// the color effect being applied.
//
// dimage_effect(x, y, img, effects, ...)
// dsubimage_effect(x, y, img, left, top, w, h, effects, ...)
//
// However if you use these super-generic functions you will link the code for
// all effects and all formats into your add-in, which takes a fair amount of
// space. If that's a problem, you can use the more specific functions below:
//
// * dimage_<FORMAT>_<EFFECT>() for one particular format (rgb16, p8, p4) along
// with one particular color effect (clearbg, swapcolor, addbg, dye).
// * dimage_<FORMAT>() is like the above when no color effect is applied.
//
// All of them support the HFLIP and VFLIP flags. For effect-specific functions
// the corresponding effect flag can be omitted (e.g. IMAGE_CLEARBG is implicit
// when using dimage_p8_clearbg()).
//---
/* dimage_effect(): Generalized dimage() supporting dynamic effects */
/* Renders the whole image by forwarding to dsubimage_effect() with the source
   rectangle (0, 0, width, height). Note that [img] is expanded three times, so
   it must not be an expression with side effects. Extra effect arguments are
   forwarded through the GNU ##__VA_ARGS__ extension. */
#define dimage_effect(x, y, img, eff, ...) \
dsubimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
##__VA_ARGS__)
/* dsubimage_effect(): Generalized dsubimage() supporting dynamic effects */
void dsubimage_effect(int x, int y, image_t const *img,
int left, int top, int w, int h, int effects, ...);
/* Specific versions for each format */
/* DIMAGE_SIG1(NAME, extra...): declares the pair dimage_NAME() and
   dsubimage_NAME(), both taking the common leading parameters followed by the
   extra parameters given to the macro. */
#define DIMAGE_SIG1(NAME, ...) \
void dimage_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
void dsubimage_ ## NAME(int x, int y, image_t const *img, \
int left, int top, int w, int h, ##__VA_ARGS__);
/* DIMAGE_SIG(SUFFIX, extra...): applies DIMAGE_SIG1 to the three formats,
   producing declarations for rgb16SUFFIX, p8SUFFIX and p4SUFFIX. */
#define DIMAGE_SIG(NAME, ...) \
DIMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
DIMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
DIMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
/* d[sub]image_{rgb16,p8,p4}_effect(..., effects, <extra arguments>) */
DIMAGE_SIG(_effect, int effects, ...)
/* d[sub]image_{rgb16,p8,p4}(..., effects) (no color effect, like dimage()) */
DIMAGE_SIG(, int effects)
/* d[sub]image_{rgb16,p8,p4}_clearbg(..., effects, bg_color_or_index) */
DIMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
/* d[sub]image_{rgb16,p8,p4}_swapcolor(..., effects, source, replacement) */
DIMAGE_SIG(_swapcolor, int effects, int source, int replacement)
/* d[sub]image_{rgb16,p8,p4}_addbg(..., effects, bg_color) */
DIMAGE_SIG(_addbg, int effects, int bg_color)
/* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */
DIMAGE_SIG(_dye, int effects, int dye_color)
/* The three macros below take over the names dimage_{rgb16,p8,p4}_effect that
   were just declared as functions by DIMAGE_SIG(_effect, ...): any later
   function-like use of these names expands to the matching
   dsubimage_*_effect() call on the full image. As with dimage_effect(), [img]
   is expanded three times. */
#define dimage_rgb16_effect(x, y, img, eff, ...) \
dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define dimage_p8_effect(x, y, img, eff, ...) \
dsubimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define dimage_p4_effect(x, y, img, eff, ...) \
dsubimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
/* The declaration helpers are private to this header. */
#undef DIMAGE_SIG
#undef DIMAGE_SIG1
//---
// Clipping utilities
//---
/* Double box specifying both a source and target area */
/* A single rectangle of size w x h described twice: where it lands in the
   output (x, y) and where it is read from in the source image (left, top).
   The clipping functions update all six fields together so that both views
   keep describing the same pixels. */
struct gint_image_box
{
/* Target location of top-left corner */
int x, y;
/* Width and height of rendered sub-image */
int w, h;
/* Top-left corner of the source area; the area covers columns
   [left, left+w) and rows [top, top+h) (low included, high excluded) */
int left, top;
};
/* Clip the provided box against the input. If, after clipping, the box no
longer intersects the output (whose size is specified as out_w/out_h),
returns false. Otherwise, returns true. */
bool gint_image_clip_input(image_t const *img, struct gint_image_box *box,
int out_w, int out_h);
/* Clip the provided box against the output. */
void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h);
//---
// Internal image rendering routines
//
// The following functions (or non-functions) are implemented in assembler and
// make up the internal interface of the image renderer. If you just want to
// display images, use dimage() and variations; these are only useful if you
// have a different rendering system and wish to use image rendering with
// dynamic effects in it.
//---
/* Renderer command. This structure includes most of the information used by
the image renderer to perform blits. Some of the information on the target
is also passed as direct arguments, which is more convenient and slightly
faster.
Most of the values here can be set with gint_image_mkcmd(). The last two
members, along with the return values of the gint_image_FORMAT_loop()
functions, are used to update the command if one needs to draw *parts* of
the image and resume the rendering later. This is used in Azur. */
/* Layout warning: the assembler entry points (gint_image_*_loop and the
   effect fragments) walk this structure field by field with post-increment
   loads, starting at [effect]. Field order, sizes and padding must therefore
   stay exactly as they are. */
struct gint_image_cmd
{
/* Shader ID. This is used in Azur, and ignored in gint */
uint8_t shader_id;
/* Dynamic effects
Bit 0: VFLIP
Bit 1: HFLIP
Bits 2-7: 0=NONE, 1=CLEARBG, 2=SWAPCOLOR, 3=DYE
gint_image_mkcmd() only fills bits 0-1; the effect-specific functions add
the color effect number shifted left by 2 (eg. += 4 for CLEARBG). */
uint8_t effect;
/* Number of pixels to render per line. For formats that force either x
or width alignment (most of them), this is already adjusted to a
suitable multiple (usually a multiple of 2). */
int16_t columns;
/* Stride of the input image (number of pixels between each row), in
pixels, without subtracting the number of columns */
int16_t input_stride;
/* Number of lines in the command. This can be adjusted freely, and is
particularly useful in Azur for fragmented rendering. */
uint8_t lines;
/* [Any effect]: Offset of first edge
NOTE(review): this is only 8 bits wide, yet gint_image_mkcmd() stores
cmd->columns in it when HFLIP is combined with an odd left position;
that value cannot be represented for widths above 127 - confirm whether
this case is reachable. */
int8_t edge_1;
/* Core loop; this is an internal label of the renderer */
void const *loop;
/* Output pixel array, offset by target x/y */
void const *output;
/* Input pixel array, offset by source x/y. For formats that force x
alignment, this is already adjusted. */
void const *input;
/* Palette, when applicable */
uint16_t const *palette;
/* [Any effect]: Offset of right edge */
int16_t edge_2;
/* [CLEARBG, SWAPCOLOR]: Source color
(the P4 DYE function also stores the image's alpha index here) */
uint16_t color_1;
/* [SWAPCOLOR]: Destination color
(the P4 DYE function stores the dye color here) */
uint16_t color_2;
/* Remaining height (for updates between fragments) */
int16_t height;
/* Local x position (for updates between fragments) */
int16_t x;
};
/* gint_image_mkcmd(): Prepare a rendering command with dynamic effects
This function crafts an image renderer command. It loads all the settings
except for effect-dependent parameters: the [.loop] label, the color section
of [.effect], and color effect settings. See the effect-specific functions
to see how they are defined.
The benefit of this approach is that the rendering code does not need to be
linked in unless an effect is actually used, which avoids blowing up the
size of the add-in as the number of supported dynamic effects increases.
@box Requested on-screen box (will be clipped depending on effects)
@img Source image
@effects Set of dynamic effects to be applied, as an [IMAGE_*] bitmask
@left_edge Whether to force 2-alignment on the input (box->left)
@right_edge Whether to force 2-alignment on the width
@cmd Command to be filled
@out_width Output width (usually DWIDTH)
@out_height Output height (usually DHEIGHT)
Returns false if there is nothing to render because of clipping (in which
case [cmd] is unchanged), true otherwise. [*box] is also updated to reflect
the final box after clipping but not accounting for edges. */
bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
int effects, bool left_edge, bool right_edge,
struct gint_image_cmd *cmd, int out_width, int out_height);
/* Entry point of the renderers. These functions can be called normally as long
as you can build the commands (eg. by using gint_image_mkcmd() then filling
the effect-specific information). */
void *gint_image_rgb16_loop (int output_width, struct gint_image_cmd *cmd);
void *gint_image_p8_loop (int output_width, struct gint_image_cmd *cmd);
void *gint_image_p4_loop (int output_width, struct gint_image_cmd *cmd);
/* Renderer fragments. The following can absolutely not be called from C code
as they aren't full functions (and this isn't their prototype). These are
continuations to be specified in the [.loop] field of a command before using
one of the functions above. */
void gint_image_rgb16_normal(void);
void gint_image_rgb16_clearbg(void);
void gint_image_rgb16_swapcolor(void);
void gint_image_rgb16_dye(void);
void gint_image_p8_normal(void);
void gint_image_p8_clearbg(void);
void gint_image_p8_swapcolor(void);
void gint_image_p8_dye(void);
void gint_image_p4_normal(void);
void gint_image_p4_clearbg(void);
void gint_image_p4_swapcolor(void);
void gint_image_p4_dye(void);
#endif /* FXCG50 */
#ifdef __cplusplus
}
#endif
#endif /* GINT_IMAGE */

107
src/render-cg/image/image.c Normal file
View File

@ -0,0 +1,107 @@
#include <gint/image.h>
#include <gint/display.h>
/* gint_image_clip_input(): Clip a rendering box against the source image

   The source rectangle (left, top, w, h) is intersected with the image's
   bounds; whenever the left or top side is cut, the target position (x, y) is
   moved by the same amount so that the remaining pixels land at the same
   place in the output.

   @img    Source image (only its width and height are read)
   @b      Box to clip, updated in place
   @out_w  Width of the output surface, in pixels
   @out_h  Height of the output surface, in pixels
   Returns false if the clipped box is empty or lies entirely outside the
   output, true otherwise. */
bool gint_image_clip_input(image_t const *img, struct gint_image_box *b,
	int out_w, int out_h)
{
	/* Adjust the bounding box of the input image */
	if(b->left < 0) b->w += b->left, b->x -= b->left, b->left = 0;
	if(b->top < 0) b->h += b->top, b->y -= b->top, b->top = 0;
	if(b->left + b->w > img->width) b->w = img->width - b->left;
	if(b->top + b->h > img->height) b->h = img->height - b->top;

	/* Nothing left of the source area */
	if(b->w <= 0 || b->h <= 0)
		return false;
	/* Check whether the box intersects the screen, horizontally... */
	if(b->x + b->w <= 0 || b->x >= out_w)
		return false;
	/* ... and vertically. The vertical extent is the height: testing the
	   width here would accept boxes that lie entirely above the output
	   whenever they are wider than tall. */
	if(b->y + b->h <= 0 || b->y >= out_h)
		return false;
	return true;
}
/* gint_image_clip_output(): Restrict a rendering box to the output surface

   Whatever sticks out of the [0, out_w) x [0, out_h) rectangle is removed
   from the box. When the left or top side is cut, the source position is
   advanced by the same number of pixels. The resulting width or height may be
   zero or negative if the box did not intersect the output to begin with. */
void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h)
{
	/* Horizontal axis: drop the columns left of the output... */
	if(b->x < 0) {
		b->left -= b->x;
		b->w += b->x;
		b->x = 0;
	}
	/* ... then the columns past its right side */
	if(b->x + b->w > out_w) {
		b->w = out_w - b->x;
	}

	/* Vertical axis: drop the rows above the output... */
	if(b->y < 0) {
		b->top -= b->y;
		b->h += b->y;
		b->y = 0;
	}
	/* ... then the rows below its bottom side */
	if(b->y + b->h > out_h) {
		b->h = out_h - b->y;
	}
}
/* gint_image_mkcmd(): Prepare the effect-independent part of a command

   Clips [box] as requested by [effects], then fills the geometry, input,
   palette, edge and output fields of [cmd]. The [.loop] pointer, the color
   bits of [.effect] and the color fields are left for the caller to set.

   @box         Requested box, updated in place by clipping
   @img         Source image
   @effects     IMAGE_* flags (the legacy DIMAGE_NOCLIP is also accepted)
   @left_edge   Force the source x position to be even (adds an edge pixel)
   @right_edge  Force the column count to be even (adds an edge pixel)
   @cmd         Command to fill
   @out_width   Output width used for clipping
   @out_height  Output height used for clipping
   Returns false, leaving [cmd] untouched, if clipping leaves nothing to
   render; true otherwise. */
bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
	int effects, bool left_edge, bool right_edge,
	struct gint_image_cmd *cmd, int out_width, int out_height)
{
	/* Convert the old DIMAGE_NOCLIP flag */
	if(effects & DIMAGE_NOCLIP)
		effects |= IMAGE_NOCLIP;

	if(!(effects & IMAGE_NOCLIP_INPUT)) {
		if(!gint_image_clip_input(img, box, out_width, out_height))
			return false;
	}
	if(!(effects & IMAGE_NOCLIP_OUTPUT)) {
		gint_image_clip_output(box, out_width, out_height);
		/* Output clipping alone does not test for intersection, so when
		   input clipping was skipped the box can come back empty or
		   negative. Stop here rather than store a negative size into
		   [columns] and the unsigned 8-bit [lines]. */
		if(box->w <= 0 || box->h <= 0)
			return false;
	}

	/* IMAGE_VFLIP/IMAGE_HFLIP sit on bits 8-9; bring them to bits 0-1 */
	cmd->effect = (effects & (IMAGE_VFLIP | IMAGE_HFLIP)) >> 8;
	cmd->columns = box->w;
	cmd->input_stride = img->width;
	cmd->x = box->x;
	/* -1 means "no edge pixel" until decided otherwise below */
	cmd->edge_1 = -1;
	cmd->edge_2 = -1;

	int p = img->profile;
	/* With VFLIP the source is read starting from its last row */
	int input_row = (effects & IMAGE_VFLIP) ? box->top+box->h-1 : box->top;

	if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) {
		/* Rows are padded to an even number of pixels */
		cmd->input_stride += (cmd->input_stride & 1);
		cmd->input = (void *)img->data +
			(input_row * cmd->input_stride + box->left) * 2;
	}
	else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) {
		/* Pixels follow the 2-byte palette size and the palette itself;
		   the palette pointer is biased by 256 bytes because pixel
		   values are signed indices */
		cmd->input = (void *)img->data + img->data[0] * 2 + 2 +
			(input_row * img->width + box->left);
		cmd->palette = (void *)img->data + 258;
	}
	else {
		/* P4: 32-byte palette, then rows of (width + 1) / 2 bytes */
		cmd->input = (void *)img->data + 32 +
			input_row * ((img->width + 1) >> 1) + (box->left >> 1);
		cmd->palette = img->data;
		/* By default, use edge_1 to indicate (box->left & 1), so that
		   functions that don't use edge_1 can still work properly */
		if(!left_edge)
			cmd->edge_1 = (box->left & 1);
	}

	if(left_edge && (box->left & 1)) {
		if(effects & IMAGE_HFLIP) {
			/* NOTE(review): edge_1 is an int8_t; this value does not
			   fit once columns exceeds 127 - confirm whether that
			   case is reachable */
			cmd->edge_1 = cmd->columns;
		}
		else {
			cmd->x--;
			cmd->edge_1 = 0;
		}
		cmd->columns++;
	}
	if(right_edge && (cmd->columns & 1)) {
		if(effects & IMAGE_HFLIP) {
			cmd->x--;
			cmd->edge_1++;
			cmd->edge_2 = 0;
		}
		else {
			cmd->edge_2 = cmd->columns;
		}
		cmd->columns++;
	}

	/* Settings for further updates */
	cmd->height = box->h;

	/* This is the default for gint, but Azur overwrites it */
	cmd->lines = box->h;
	/* NOTE(review): the row pitch is DWIDTH, not out_width, since the
	   target is always gint_vram here */
	cmd->output = (void *)gint_vram + (DWIDTH * box->y + cmd->x) * 2;
	return true;
}

View File

@ -0,0 +1,25 @@
/* START: Sets up the inner and outer loop. The outer loop is anything between
the calls to macros START and END, while the inner loop is the code between
labels 2: and 3: (both *INCLUDED*).
The inner loop uses the CPU's hardware repeat facility: ldrs/ldre load the
repeat start/end addresses and ldrc loads the repeat count (see the SH4AL-DSP
manual for the exact placement rules). Label 1: is the outer loop head that
END branches back to, so the count is reloaded from r2 for every line; r2 is
expected to hold the per-line iteration count prepared by the caller. */
.macro START
ldrs 2f
ldre 3f
1: ldrc r2
nop
.endm
/* END: Finishes the outer loop and adds strides.
Register roles follow the map documented in the format entry points:
r1 = lines remaining, r3/r4 = input pointer/stride, r5/r6 = output
pointer/stride. */
.macro END
dt r1 /* One line done; T=1 once r1 reaches 0 */
add r4, r3 /* Input pointer to the next line */
bf.s 1b /* More lines: back to label 1: set by START */
add r6, r5 /* (delay slot) Output pointer to the next line */
.endm
/* EPILOGUE: Finishes the call by reloading registers saved in the prologue.
r8 and r9 are the first two registers pushed by the format entry point, so
they are popped last, in reverse order. The current input pointer (r3) is
returned in r0. */
.macro EPILOGUE
mov.l @r15+, r9
mov r3, r0 /* Return value: input pointer after the last line */
rts
mov.l @r15+, r8 /* (delay slot) */
.endm

View File

@ -0,0 +1,86 @@
.global _gint_image_p4_loop
/* gint's image renderer: 4-bit indexed entry point
P4 compacts pixel data further than P8 by restricting values to a 16-color
palette and packing 2 pixels in each byte. This severely restricts our
ability to use sub-images because odd positions land within bytes.
Fortunately, we can solve this by using more edge pixels. The simplest way
to write a P4 loop is to process 2 pixels from a 2-aligned source image
position in a single iteration. Other structures don't even come close in
terms of CPU performance (which, as a reminder, is the main bottleneck in
Azur but not in gint): selecting nibbles individually is too long, while not
unrolling is still clearly inefficient. So it becomes very important to
forcibly align the sub-image on byte-aligned input boundaries and stick to
that grid.
Obviously, this approach causes up to one extra pixel to be overwritten on
each side of every line. We solve this problem by adding *another* edge
pixel on the left side. In the renderer this is called the left edge or
edge_1, while the standard one is called right edge or edge_2.
The register map below is the state handed over to the effect-specific
fragment that this entry point jumps to through cmd.loop. r8 is left
pointing at cmd.palette so that the fragment can keep reading the command.
r0: - (initially: cmd.effect)
r1: Number of lines remaining to draw
r2: Number of columns per line
r3: Input pointer
r4: Input stride
r5: Output pointer
r6: Output stride
r7: Right edge pointer
r8: - (initially: cmd)
r9: - (initially: cmd.loop)
r10: Left edge pointer */
_gint_image_p4_loop:
/* r4: int output_width (pixels)
r5: struct gint_image_cmd *cmd */
mov.b @(1,r5), r0 /* cmd.effect */
add #2, r5
mov.w @r5+, r2 /* cmd.columns */
mov r4, r6 /* r6 = output_width */
mov.l r8, @-r15
mov r5, r8
/* From here on the command is r8 */
mov.l r9, @-r15
sub r2, r6 /* r6 = output_width - columns (pixels) */
mov.w @r8+, r4 /* cmd.input_stride */
add r6, r6 /* Output stride in bytes (2 bytes per pixel) */
mov.b @r8+, r1 /* cmd.lines */
shlr r4 /* r4 = stride / 2, T = stride & 1 */
mov.l r10, @-r15
extu.b r1, r1 /* cmd.lines is unsigned: undo mov.b's sign extension */
mov.b @r8+, r10 /* cmd.edge_1 */
nop
mov #0, r9
addc r9, r4 /* r4 = (img.width + 1) >> 1 */
mov.l @r8+, r9 /* cmd.loop */
shlr r0 /* T bit is now VFLIP */
mov.l @r8+, r5 /* cmd.output */
nop
bf.s _NO_VFLIP
mov.l @r8+, r3 /* cmd.input */
_VFLIP:
neg r4, r4 /* Walk the input rows backwards */
nop
_NO_VFLIP:
mov r2, r7
shlr r7 /* r7 = columns / 2, T = columns & 1 */
jmp @r9 /* Continue in the effect-specific fragment */
subc r7, r4 /* (delay slot) r4 -= (columns / 2) + (columns & 1) */

View File

@ -0,0 +1,42 @@
#include <gint/image.h>
#include <gint/display.h>
/* dimage_p4(): Render a full P4 image with its top-left corner at (x, y)
   This is dsubimage_p4() applied to the whole surface of the image. */
void dimage_p4(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4(x, y, img, 0, 0, full_w, full_h, eff);
}
/* dsubimage_p4(): Render a section of a P4 image without color effect

   Images in the P4_RGB565A format are forwarded to dsubimage_p4_clearbg()
   with the image's own alpha index, so that their transparent color is
   honored. Other images are rendered opaque with the "normal" loop, which
   needs no edge pixels (hence both edge flags are false).

   @x @y       Position of the top-left corner on the display
   @img        Source image
   @left @top  Top-left corner of the section within the image
   @w @h       Size of the section
   @eff        IMAGE_* flags (clipping, HFLIP, VFLIP) */
void dsubimage_p4(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	if(img->profile == IMAGE_P4_RGB565A) {
		/* Call then return: `return f();` with a void expression is
		   not valid ISO C in a function returning void */
		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	/* Nothing to draw after clipping */
	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT)) return;

	cmd.loop = gint_image_p4_normal;
	gint_image_p4_loop(DWIDTH, &cmd);
}
/* dimage_p4_clearbg(): Render a full P4 image, treating palette index [bg] as
   transparent. Equivalent to dsubimage_p4_clearbg() on the whole image. */
void dimage_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
/* dsubimage_p4_clearbg(): Render a section of a P4 image with one palette
   index made transparent

   The command is built with both edge pixels enabled, then completed with
   the CLEARBG effect number (1, stored from bit 2 of cmd.effect upwards), the
   index to skip, and the CLEARBG loop. */
void dsubimage_p4_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd cmd;
	struct gint_image_box box = {
		.x = x, .y = y,
		.w = w, .h = h,
		.left = left, .top = top,
	};

	if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, DHEIGHT))
		return;

	cmd.loop = gint_image_p4_clearbg;
	cmd.color_1 = bg_color;
	cmd.effect += (1 << 2);

	gint_image_p4_loop(DWIDTH, &cmd);
}

View File

@ -0,0 +1,153 @@
.global _gint_image_p4_clearbg
#include "image_macros.S"
/* P4 CLEARBG, RAM version: by NULL canceling.
This function is similar to P8 CLEARBG. Transparent pixels are not limited
by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an
explanation of NULL canceling.
As implemented below: for each pixel, T is set when the pixel equals the
alpha index, then (-1 + T) is computed with addc, giving 0 for a transparent
pixel and all-ones otherwise. ANDing this with the output pointer yields
either the real destination or a pointer near address 0, and the pixel is
stored unconditionally through it.
Each inner iteration consumes one input byte (2 pixels). The low nibble is
stored at displacement OFF1 and the high nibble at OFF2.
r0: [temporary]
r7: Right edge pointer
r8: Alpha value
r9: Palette
r10: Left edge pointer
r11: Nullable output pointer
r12: 0 (in outer loop: edge stride)
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-12,r15): Right edge value
@(-8,r15): Left edge value
@(-4,r15): Edge stride */
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2 /* r2 = columns / 2 = input bytes per line */
nop
add r10, r10 /* edge_1: pixel offset to byte offset */
nop
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0 /* r0 = 4 * (columns / 2) = output bytes per line */
mov.l r12, @-r15
shll r7 /* edge_2: pixel offset to byte offset */
mov.l r11, @-r15
add r5, r7 /* r7 = address of right edge pixel */
mov r0, r12
add r6, r12 /* r12 = edge stride (line bytes + output stride) */
mov.l r13, @-r15
add r5, r10 /* r10 = address of left edge pixel */
mov.l r14, @-r15
add #-4, r5 /* Bias r5 so that OFF1/OFF2 reach the current pair */
mov.w @r8, r8 /* cmd.color_1 */
add #-1, r4 /* Input stride compensation for pipelining */
.if \HFLIP
add r0, r5 /* Start from the end of the output line */
nop
shll r0
nop
add r0, r6 /* Stride also undoes the backwards walk of each line */
nop
.endif
shll r8 /* alpha*2 compares against palette offsets */
nop
START
mov.b @r3+, \TMP1 /* Preload the first byte of the line */
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
shll \TMP1 /* Pixel values doubled: byte offsets in the palette */
mov.w @r10, r0 /* Save left edge */
nop
mov.l r0, @-r15
nop
mov.l r12, @-r15 /* Spill edge stride; r12 is 0 during the inner loop */
mov #0, r12
2: mov \TMP1, r0
and #0x1e, r0 /* Low nibble * 2 */
cmp/eq r0, r8 /* T = pixel is transparent */
mov #-1, r11
addc r12, r11 /* r11 = -1 + T: 0 if transparent, ~0 otherwise */
mov #-4, \TMP2
and r5, r11 /* r11 = output pointer or NULL */
mov.w @(r0,r9), r0 /* Palette lookup */
shld \TMP2, \TMP1 /* Shift right by 4: bring the high nibble down */
mov #0x1e, \TMP2
and \TMP2, \TMP1 /* High nibble * 2 */
mov.w r0, @(\OFF1,r11)
cmp/eq \TMP1, r8 /* Same NULL canceling for the second pixel */
mov #-1, r11
addc r12, r11
mov \TMP1, r0
and r5, r11
mov.b @r3+, \TMP1 /* Preload the next input byte */
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
mov.w r0, @(\OFF2,r11)
3: shll \TMP1
mov.l @r15+, r12 /* Reload edge stride */
nop
mov.l @r15+, r0
nop
mov.w r0, @r10 /* Restore left edge */
add r12, r10
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
END
/* Pop in reverse order of the pushes above, then r10 which was saved by
the entry point */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
_gint_image_p4_clearbg:
/* r0 holds cmd.effect shifted right once by the entry point, so bit 0 is
HFLIP here */
tst #1, r0
bf 9f
GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2

View File

@ -0,0 +1,147 @@
.global _gint_image_p4_dye
#include "image_macros.S"
/* P4 DYE, RAM version: by NULL canceling.
Like with P8, this effect removes most of the complexity because there is no
longer any need to index the palette. However the decoding still takes a lot
of EX work so the performance is not as good. Since there are transparent
areas, Azur's CPU-bound version is at least to some extent faster than
bopti, so that's what we're using.
See P8 CLEARBG for an explanation of NULL canceling.
As implemented below: each nibble is compared with the alpha index; the
store address is the output pointer for opaque pixels and a pointer near
address 0 for transparent ones, and the dye color in r0 is stored
unconditionally through it. The low nibble of each input byte goes to
displacement OFF1 and the high nibble to OFF2.
r0: Dye value
r7: Right edge pointer
r8: Alpha value
r9: 0 (to neutralize addc during NULL-cancelling)
r10: Left edge pointer
r11: Nullable output pointer
r12: Edge stride
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-8,r15): Right edge value
@(-4,r15): Left edge value */
.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2 /* r2 = columns / 2 = input bytes per line */
nop
add r10, r10 /* edge_1: pixel offset to byte offset */
nop
mov.l @r8+, r0 /* cmd.palette (don't care) */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0 /* r0 = 4 * (columns / 2) = output bytes per line */
mov.l r12, @-r15
shll r7 /* edge_2: pixel offset to byte offset */
mov.l r11, @-r15
add r5, r7 /* r7 = address of right edge pixel */
mov r0, r12
add r6, r12 /* r12 = edge stride (line bytes + output stride) */
mov.l r13, @-r15
add r5, r10 /* r10 = address of left edge pixel */
mov.l r14, @-r15
add #-4, r5 /* Bias r5 so that OFF1/OFF2 reach the current pair */
.if \HFLIP
add r0, r5 /* Start from the end of the output line */
nop
shll r0
nop
add r0, r6 /* Stride also undoes the backwards walk of each line */
nop
.endif
mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */
add #-1, r4 /* Input stride compensation for pipelining */
mov.w @r8, r8 /* cmd.color_1 (alpha value) */
nop
START
mov.b @r3+, \TMP1 /* Preload the first byte of the line */
nop
mov.w @r7, \TMP2 /* Save right edge */
nop
mov.l \TMP2, @-r15
mov #0x0f, \TMP2
mov.w @r10, r9 /* Save left edge */
and \TMP1, \TMP2 /* TMP2 = low nibble of the first byte */
mov.l r9, @-r15
mov #0, r9
2: cmp/eq \TMP2, r8 /* T = low-nibble pixel is transparent */
mov #-1, r11
addc r9, r11 /* r11 = -1 + T: 0 if transparent, ~0 otherwise */
mov #-4, \TMP2
and r5, r11 /* r11 = output pointer or NULL */
nop
shld \TMP2, \TMP1 /* Shift right by 4: bring the high nibble down */
mov #0x0f, \TMP2
and \TMP2, \TMP1
mov.w r0, @(\OFF1,r11)
cmp/eq \TMP1, r8 /* Same NULL canceling for the high-nibble pixel */
mov #-1, r11
addc r9, r11
mov.b @r3+, \TMP1 /* Preload the next input byte */
and r5, r11
nop
mov #0x0f, \TMP2
and \TMP1, \TMP2 /* Low nibble for the next iteration */
add #\OUT_DIR, r5
3: mov.w r0, @(\OFF2,r11)
mov.l @r15+, \TMP2
nop
mov.w \TMP2, @r10 /* Restore left edge */
add r12, r10
mov.l @r15+, \TMP2
nop
mov.w \TMP2, @r7 /* Restore right edge */
add r12, r7
END
/* Pop in reverse order of the pushes above, then r10 which was saved by
the entry point */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
_gint_image_p4_dye:
/* r0 holds cmd.effect shifted right once by the entry point, so bit 0 is
HFLIP here */
tst #1, r0
bf 9f
GEN_DYE_LOOP 0, 4, r13, r14, 6, 4
9: GEN_DYE_LOOP 1, -4, r13, r14, 0, 2

View File

@ -0,0 +1,23 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_p4_dye(): Render a full P4 image with every non-transparent pixel
   replaced by [dye_color]. Equivalent to dsubimage_p4_dye() on the whole
   image. */
void dimage_p4_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
/* dsubimage_p4_dye(): Render a section of a P4 image in a single color

   The command is built with both edge pixels enabled, then completed with
   the DYE effect number (3, stored from bit 2 of cmd.effect upwards), the
   image's alpha index as color_1, the dye color as color_2, and the DYE
   loop. */
void dsubimage_p4_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_cmd cmd;
	struct gint_image_box box = {
		.x = x, .y = y,
		.w = w, .h = h,
		.left = left, .top = top,
	};

	if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, DHEIGHT))
		return;

	cmd.loop = gint_image_p4_dye;
	cmd.color_2 = dye_color;
	cmd.color_1 = img->alpha;
	cmd.effect += (3 << 2);

	gint_image_p4_loop(DWIDTH, &cmd);
}

View File

@ -0,0 +1,32 @@
#include <gint/image.h>
/* dsubimage_p4_effect(): Render a section of a P4 image with dynamic effects

   Dispatches to the effect-specific function selected by the color effect
   flag in [eff]. If several color flags are set, the first of CLEARBG,
   SWAPCOLOR, ADDBG, DYE wins. The extra arguments are ints: one for CLEARBG,
   ADDBG and DYE, two (source then replacement) for SWAPCOLOR, none when no
   color effect is requested. */
void dsubimage_p4_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	int const clearbg = eff & IMAGE_CLEARBG;
	int const swapcolor = !clearbg && (eff & IMAGE_SWAPCOLOR);
	int const any_color = eff &
		(IMAGE_CLEARBG | IMAGE_SWAPCOLOR | IMAGE_ADDBG | IMAGE_DYE);

	/* Fetch the variadic arguments first, then dispatch */
	int first = 0, second = 0;
	va_list extra;
	va_start(extra, eff);
	if(any_color)
		first = va_arg(extra, int);
	if(swapcolor)
		second = va_arg(extra, int);
	va_end(extra);

	if(clearbg)
		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff, first);
	else if(swapcolor)
		dsubimage_p4_swapcolor(x, y, img, left, top, w, h, eff, first,
			second);
	else if(eff & IMAGE_ADDBG)
		dsubimage_p4_addbg(x, y, img, left, top, w, h, eff, first);
	else if(eff & IMAGE_DYE)
		dsubimage_p4_dye(x, y, img, left, top, w, h, eff, first);
	else
		dsubimage_p4(x, y, img, left, top, w, h, eff);
}

View File

@ -0,0 +1,125 @@
.global _gint_image_p4_normal
#include "image_macros.S"
/* P4 Opaque rendering, VRAM version: by unrolling without edge pixels.
This is the most unique function in the renderer, Azur included. A P4 image
cannot reasonably be decoded on a per-pixel basis because extracting half-
bytes is too slow. But using edge pixels results in extra write surface that
makes us slower than bopti in gint 2.7.
This loop is thus the only one to implement 2-unrolling (no pipeline) while
manually avoiding the writes that a pair of edge pixels usually fix. Subtle
adjustments to strides are involved, making this function one of the most
tricky.
A slight change is made to the command for the purpose of this function;
cmd.edge_1 (which is r10) is set to indicate whether the [left] side of the
box is even (r10=0) or odd (r10=1). This allows us to enter the loop at the
correct position.
Unlike the other P4 loops this one does not use the START macro: the column
loop is an ordinary dt/branch loop counted in r8, and label 1: below is the
outer loop head that END branches back to.
r0: [temporary]
r7: [temporary]
r8: Column counter
r9: Palette
r10: box->left & 1
r11: [temporary] */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
add #-4, r5 /* Better positioning for @(OFF[12], r5) */
/* The following arithmetic is to decrease r4 if the width is even
(r2 & 1) and left is odd (r10 = 1), since that means both the first
and last pixel load a full byte but use only half */
mov r2, r0
xor #1, r0
mov.w @r8+, r7 /* cmd.edge_2 (don't care) */
and r10, r0
mov.l r11, @-r15
sub r0, r4
.if \HFLIP
mov r2, r0
shll r0 /* r0 = 2 * columns = output bytes per line */
add r0, r5 /* Start from the end of the output line */
nop
shll r0
nop
add r0, r6 /* Stride also undoes the backwards walk of each line */
nop
.endif
1: mov r2, r8 /* Reload the column counter for this line */
tst r10, r10 /* Check whether to do an extra half iter. */
bt 2f
nop
/* Additional half-iteration if box->left is odd: only the low nibble of
the first byte is drawn */
mov.b @r3+, r0
shll r0
and #0x1e, r0 /* Low nibble * 2 = byte offset in the palette */
mov.w @(r0, r9), r0
dt r8
mov.w r0, @(\OFF1, r5)
bt.s 3f /* Line was a single pixel wide */
add #\OUT_DIR, r5
/* The main loop needs to load pixels in output order. This is not
ideal for CPU usage, but we have some margins */
2: mov.b @r3+, \TMP1
mov #-4, \TMP2
/* Stall */
shll \TMP1
mov \TMP1, r0
shld \TMP2, r0 /* Shift right by 4: bring the high nibble down */
nop
and #0x1e, r0 /* High nibble * 2 */
mov #0x1e, \TMP2
/* Stall */
mov.w @(r0,r9), r0
and \TMP2, \TMP1 /* Low nibble * 2, kept for the second pixel */
dt r8
mov.w r0, @(\OFF1,r5)
bt.s 3f /* Odd pixel count: stop after the high nibble */
add #\OUT_DIR, r5
mov \TMP1, r0
add #\OUT_DIR, r5
dt r8
mov.w @(r0,r9), r0
bf.s 2b
mov.w r0, @(\OFF2,r5)
3: END
/* r11 was pushed above; r10 was pushed by the entry point */
mov.l @r15+, r11
mov.l @r15+, r10
EPILOGUE
.endm
_gint_image_p4_normal:
/* r0 holds cmd.effect shifted right once by the entry point, so bit 0 is
HFLIP here */
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 2, r7, r11, 4, 2
9: GEN_NORMAL_LOOP 1, -2, r7, r11, 2, 4

View File

@ -0,0 +1,175 @@
.global _gint_image_p4_swapcolor
#include "image_macros.S"
/* P4 SWAPCOLOR, RAM version: by branchless xor selection.
I'm not sure whether this is the most optimized version for RAM. But it's
about 7-8% slower than bopti, and the effort of writing yet another
variation of P4's arduous loops doesn't seem worth it for a rare dynamic
effect. This is Azur's version.
See P8 SWAPCOLOR for an explanation of branchless xor selection.
r0: [temporary]
r7: Right edge pointer
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Left edge pointer
r11: Holds (x ^ y) & -(c == x) during selection
r12: cmd.color_1
r13: [temporary]
r14: [temporary] (in outer loop: edge stride)
Spilled to stack:
@(-12,r15): Right edge value
@(-8,r15): Left edge value
@(-4,r15): Edge stride */
/* GEN_SWAPCOLOR_LOOP: body of the P4 SWAPCOLOR renderer for one horizontal
   direction. Every iteration of the inner loop (labels 2 to 3) handles one
   input byte, ie. two 4-bit pixels.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 once per iteration (one pixel pair)
     TMP1     Scratch register holding the current input byte (times 2)
     TMP2     Scratch register holding shift amounts and masks
     OFF1     Displacement from r5 for the first pixel of a byte
     OFF2     Displacement from r5 for the second pixel of a byte
   START, END and EPILOGUE are macros from image_macros.S, which is not part
   of this listing; presumably START/END open and close the per-line loop
   around the region delimited by labels 2 and 3. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
/* r2 is halved: the loop counts pixel pairs. r10 is doubled to become a
byte offset. */
shlr r2
nop
add r10, r10
nop
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
/* r0 = 4 * r2: number of output bytes covered by the pixel pairs */
shll2 r0
mov.l r12, @-r15
shll r7
mov.l r13, @-r15
/* r7 = r5 + 2 * edge_2: right edge pointer */
add r5, r7
mov.w @r8+, r13 /* cmd.color_1 */
/* r10 = r5 + 2 * r10: left edge pointer */
add r5, r10
mov.l r11, @-r15
add #-4, r5
mov r13, r12
shll r13
mov.l r14, @-r15
/* r13 = address of palette[color_1] */
add r9, r13
mov.w @r8, r8 /* cmd.color_2 */
add #-1, r4 /* Input stride compensation for pipelining */
mov.w @r13, r13
/* r14 = r0 + r6: edge stride, added to both edge pointers after each line */
mov r0, r14
add r6, r14
nop
/* r8 = palette[color_1] ^ color_2 */
xor r13, r8
nop
.if \HFLIP
/* r5 += r0 and r6 += 2 * r0 */
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
shll r12 /* Compare color_1 * 2 with shifted values */
nop
START
/* Per-line prologue: preload the first byte, then spill both edge values
and the edge stride (r14 doubles as TMP2 inside the loop) */
mov.b @r3+, \TMP1
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
shll \TMP1
mov.w @r10, r0 /* Save left edge */
nop
mov.l r0, @-r15
nop
mov.l r14, @-r15
nop
/* r0 = first masked nibble times 2, compared with color_1 * 2 */
2: mov \TMP1, r0
and #0x1e, r0
cmp/eq r0, r12
mov #-4, \TMP2
/* r11 = -T, ie. all ones if the index equals color_1 and zero otherwise */
subc r11, r11
nop
mov.w @(r0,r9), r0
and r8, r11
/* TMP1 is shifted right by 4 and masked to get the other nibble times 2 */
shld \TMP2, \TMP1
mov #0x1e, \TMP2
/* Selection: r0 becomes color_2 if the index matched, palette value
otherwise */
xor r11, r0
mov.w r0, @(\OFF1,r5)
and \TMP2, \TMP1
nop
cmp/eq \TMP1, r12
nop
subc r11, r11
mov \TMP1, r0
add #\OUT_DIR, r5
/* Preload the next input byte for the following iteration */
mov.b @r3+, \TMP1
and r8, r11
mov.w @(r0,r9), r0
shll \TMP1
nop
xor r11, r0
3: mov.w r0, @(\OFF2,r5)
/* Per-line epilogue: pop the edge stride, write both saved edge values
back and move the edge pointers to the next line */
mov.l @r15+, r14
nop
mov.l @r15+, r0
nop
mov.w r0, @r10 /* Restore left edge */
add r14, r10
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r14, r7
END
/* Restore the registers pushed during setup. NOTE(review): r10 is not
pushed in this macro; presumably the P4 entry code pushes it - confirm. */
mov.l @r15+, r14
mov.l @r15+, r11
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point of the P4 SWAPCOLOR loop. Bit 0 of r0 selects the variant: when
   it is clear execution falls into the HFLIP=0 expansion, when it is set the
   branch goes to the HFLIP=1 expansion at label 9. The two expansions differ
   in the sign of OUT_DIR and in swapped OFF1/OFF2 displacements.
   NOTE(review): r0 is presumably cmd.effect shifted right once by the P4
   entry code (not shown here) - confirm. */
_gint_image_p4_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 6, 0
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 0, 6

View File

@ -0,0 +1,46 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_p4_swapcolor(): Render a whole P4 image with one color replaced
   Forwards to dsubimage_p4_swapcolor() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_p4_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
/* dsubimage_p4_swapcolor(): Render a section of a P4 image, replacing one
   palette index with another color

   @x @y       Target position passed to the command builder
   @img        Source image
   @left @top  Top-left corner of the source rectangle
   @w @h       Size of the source rectangle
   @eff        Effect flags, forwarded to gint_image_mkcmd()
   @old_index  Stored in cmd.color_1 (index to replace)
   @new_color  Stored in cmd.color_2 (replacement color)

   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p4_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_index, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, true, true, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = old_index;
		command.color_2 = new_color;
		command.loop = gint_image_p4_swapcolor;
		gint_image_p4_loop(DWIDTH, &command);
	}
}
/* dimage_p4_addbg(): Render a whole P4 image over a background color
   Forwards to dsubimage_p4_addbg() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_p4_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
/* dsubimage_p4_addbg(): Render a section of a P4 image over a background
   This is the SWAPCOLOR loop with the image's own alpha value (img->alpha) as
   the color to replace and bg_color as the replacement.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p4_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, true, true, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = img->alpha;
		command.color_2 = bg_color;
		command.loop = gint_image_p4_swapcolor;
		gint_image_p4_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,103 @@
.global _gint_image_p8_loop
/* gint's image renderer: 8-bit indexed entry point
P8 compacts images by indexing each pixel on a 256-color palette, thus
halving the amount of data per pixel. This comes at the cost of an
additional lookup during rendering. For this format, there is no way to
bundle pixels together, and the more advanced loops handle pixels
individually with a 2-unrolled 2-stage-pipeline structure to accelerate the
CPU processing when that is the bottleneck (which often means where there
are transparent pixels to skip).
For readers not familiar with loop optimization literature, the main idea is
that a simple loop which loads a pixel, processes it, and writes it, is too
inefficient because of RAW delays. To use the full speed of the CPU, one
needs to do more work in parallel and spread out actions on a single pixel,
which we do here with two loop transforms:
* _Pipelining_ the loop consists in handling a single pixel over several
iterations by doing a little bit of work in each iteration. The data for
the pixel would move from register to register at each iteration, with the
loop code doing one stage's worth of computation on each register. This
gives us more pixels to work on simultaneously, and more independent work
means less RAW limitations. Loops in this renderer have 2 stages at most.
* _Unrolling_ iterations of the loop consists in loading two (or more) pixels
at the start of each iteration so that we can work on one while waiting
for stalls and dependencies on the other. Unlike pipelining, pixels are
still confined within iterations. Non-trivial loops in this renderer
process 2 pixels per iteration.
Unrolling has one major flaw: handling pairs of pixels only works if the
total amount of pixels to draw is even. The usual way to handle this for n
pixels is to do n/2 iterations and handle the last pixel individually if n
is odd. This is extremely annoying, since every row must check the value of
n, and an extra copy of the loop code for a single pixel must be maintained
on the side, which takes more space and more effort.
However, we have a specialized solution here with *edge pixels*. The idea of
edge pixels is to round the number of pixels *up* and perform (n+1)/2 runs
of the inner loop. If n is odd, this will overwrite a single pixel at the
end of the line. We can cancel this error after-the-fact by saving the value
of the (n+1)-th pixel of the line before the loop, and restoring it
afterwards. Note that if n is even then the save/restore is a no-op.
This takes some caution however, as the temporary overwrite could be seen by
an interrupt. Some measures are put in place to reserve a couple of bytes on
each side of gint's VRAM and Azur's target fragment to avoid any problems.
r0: - (initially: cmd.effect)
r1: Number of lines remaining to draw
r2: Number of columns per line
r3: Input pointer
r4: Input stride
r5: Output pointer
r6: Output stride
r7: Right edge or [temporary]
r8: - (initially: cmd)
r9: - (initially: cmd.loop) */
/* Entry point for P8 rendering: unpacks the fields of the command structure
   into registers, then jumps to the loop function stored in the command.
   On exit r8 points just past cmd.input, and the caller's r8 and r9 are left
   on the stack. */
_gint_image_p8_loop:
/* r4: int output_width (pixels)
r5: struct gint_image_cmd *cmd */
mov.b @(1,r5), r0 /* cmd.effect */
add #2, r5
mov.l r8, @-r15
mov r4, r6
mov.w @r5+, r2 /* cmd.columns */
mov r5, r8
/* From here on the command is r8 */
mov.l r9, @-r15
shlr r0 /* T bit is now VFLIP */
mov.w @r8+, r4 /* cmd.input_stride */
/* r6 = (output_width - columns) * 2, the output stride in bytes */
sub r2, r6
mov.b @r8+, r1 /* cmd.lines */
add r6, r6
mov.b @r8+, r9 /* cmd.edge_1 - don't care */
nop
/* r9 = next longword of the command, which is the jump target below
(cmd.loop) */
mov.l @r8+, r9
/* cmd.lines was loaded sign-extended; make it unsigned */
extu.b r1, r1
mov.l @r8+, r5 /* cmd.output */
nop
/* T still holds the bit shifted out of cmd.effect above */
bf.s _NO_VFLIP
mov.l @r8+, r3 /* cmd.input */
_VFLIP:
/* Negate the input stride when the VFLIP bit is set */
neg r4, r4
nop
_NO_VFLIP:
/* Delay slot: r4 -= columns, since the loops advance the input pointer by
one byte per pixel */
jmp @r9
sub r2, r4

View File

@ -0,0 +1,42 @@
#include <gint/image.h>
#include <gint/display.h>
/* dimage_p8(): Render a whole P8 image
   Forwards to dsubimage_p8() with a source rectangle that starts at (0,0) and
   spans the full width and height of the image. */
void dimage_p8(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8(x, y, img, 0, 0, full_w, full_h, eff);
}
/* dsubimage_p8(): Render a section of a P8 image

   @x @y       Target position passed to the command builder
   @img        Source image
   @left @top  Top-left corner of the source rectangle
   @w @h       Size of the source rectangle
   @eff        Effect flags, forwarded to gint_image_mkcmd()

   Images whose profile is IMAGE_P8_RGB565A are delegated to
   dsubimage_p8_clearbg() with the image's own alpha value; all other images
   go through the opaque loop. Nothing is drawn when gint_image_mkcmd()
   returns false. */
void dsubimage_p8(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	if(img->profile == IMAGE_P8_RGB565A) {
		/* A return statement with an expression is not allowed in a
		   void function in ISO C, so call first and return after */
		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT)) return;

	cmd.loop = gint_image_p8_normal;
	gint_image_p8_loop(DWIDTH, &cmd);
}
/* dimage_p8_clearbg(): Render a whole P8 image with one index transparent
   Forwards to dsubimage_p8_clearbg() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
/* dsubimage_p8_clearbg(): Render a section of a P8 image, skipping pixels
   equal to bg_color
   The value to skip is stored in cmd.color_1 and the command is run with the
   P8 CLEARBG loop. Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p8_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, true, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as the other CLEARBG wrappers */
		command.effect += 4;
		command.color_1 = bg_color;
		command.loop = gint_image_p8_clearbg;
		gint_image_p8_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,147 @@
.global _gint_image_p8_clearbg
#include "image_macros.S"
/* P8 CLEARBG, RAM version: by NULL canceling.
This function is one of the few that can still be bottlenecked by CPU in the
RAM model. This is because transparent pixels can be skipped over as fast as
the CPU allows without worrying about the writing speed of the RAM.
For some reason that I have yet to uncover, branches are way slower than the
SH4AL-DSP manual suggests, and even slower while inside of DSP loops. This
completely favors branchless methods, and the one used here is one I call
"NULL canceling".
The idea is that a write can be turned into a no-op by either writing the
value that is already in memory, or by writing somewhere else. The first
option is pretty slow, especially because it requires a selection operation
(rn = condition ? rn : rm) which is like the most general branchless trick.
NULL canceling abuses the fact that NULL is mapped read-only on the platform
to turn the target pointer into NULL with the following identity:
target & -(condition) = (condition ? target : NULL)
The term -(condition) is materialized with an [addc #-1, #0] instruction
after the test, then the result is applied onto the target pointer with
[and], completing the trick in only 2 EX instructions. It does take more
registers, and prevents from using pre-decrement on the target.
r0: [temporary]
r7: Right edge pointer
r8: Alpha value
r9: Palette
r10: Nullable output pointer
r11: 0 (to neutralize addc during NULL-cancelling)
r12: Right edge stride
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-4,r15): Right edge value */
/* GEN_CLEARBG_LOOP: body of the P8 CLEARBG renderer for one horizontal
   direction. Every iteration of the inner loop (labels 2 to 3) handles two
   pixels and preloads the two input bytes of the next iteration.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 once per iteration (one pixel pair)
     TMP1     Scratch register for the second pixel of a pair
     TMP2     Scratch register for the first pixel of a pair
     OFF1     Displacement from the output pointer for the first pixel
     OFF2     Displacement from the output pointer for the second pixel
   START, END and EPILOGUE are macros from image_macros.S, which is not part
   of this listing; presumably START/END open and close the per-line loop
   around the region delimited by labels 2 and 3. */
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
/* r2 is halved: the loop counts pixel pairs */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
/* r0 = 4 * r2: number of output bytes covered by the pixel pairs */
shll2 r0
mov.l r10, @-r15
shll r7
mov.l r11, @-r15
/* r7 = r5 + 2 * edge_2: right edge pointer */
add r5, r7
/* r12 = r0 + r6: right edge stride, added to r7 after each line */
mov r0, r12
add r6, r12
mov.l r13, @-r15
add #-4, r5
mov.l r14, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */
mov #0, r11
.if \HFLIP
/* r5 += r0 and r6 += 2 * r0 */
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
/* Per-line prologue: preload two input bytes, spill the right edge value,
and compare the first pixel with the alpha value */
mov.b @r3+, \TMP2
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
cmp/eq \TMP2, r8
mov.b @r3+, \TMP1
add \TMP2, \TMP2
/* r10 = -1 + T: zero if the pixel equals the alpha value, all ones
otherwise */
2: mov #-1, r10
addc r11, r10 /* r10 is now the mask */
/* r10 = r5 for an opaque pixel, 0 for a transparent one */
and r5, r10
mov \TMP2, r0
cmp/eq \TMP1, r8
mov.w @(r0, r9), r0
mov.w r0, @(\OFF1, r10)
add #\OUT_DIR, r5
mov.b @r3+, \TMP2
nop
/* Same masking for the second pixel of the pair */
mov #-1, r10
addc r11, r10
add \TMP1, \TMP1
mov \TMP1, r0
mov.b @r3+, \TMP1
and r5, r10
mov.w @(r0, r9), r0
/* Comparison for the first pixel of the next iteration */
cmp/eq \TMP2, r8
mov.w r0, @(\OFF2, r10)
3: add \TMP2, \TMP2
/* Per-line epilogue: write the saved right edge value back and move the
edge pointer to the next line */
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
END
/* Restore the registers pushed during setup, in reverse order */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point of the P8 CLEARBG loop, reached through the jump at the end of
   _gint_image_p8_loop. At that point r0 holds cmd.effect shifted right once,
   so bit 0 selects the variant: clear falls into the HFLIP=0 expansion, set
   branches to the HFLIP=1 expansion at label 9. */
_gint_image_p8_clearbg:
tst #1, r0
bf 9f
GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4

View File

@ -0,0 +1,115 @@
.global _gint_image_p8_dye
#include "image_macros.S"
/* P8 DYE, RAM version: by NULL canceling.
This effect basically removes all the complexity out of P8 because we no
longer need to index the palette. We only keep the tight loop so that the
CPU can speed in areas with many transparent pixels. This gives some
acceleration over bopti.
See P8 CLEARBG for an explanation of NULL canceling.
r0: Dye value
r7: Right edge pointer
r8: Alpha value
r9: Right edge value
r10: Nullable output pointer
r11: 0 (to neutralize addc during NULL-cancelling)
r12: Right edge stride
r13: [temporary]
r14: [temporary] */
/* GEN_DYE_LOOP: body of the P8 DYE renderer for one horizontal direction.
   Every iteration of the inner loop (labels 2 to 3) handles two pixels: each
   one is compared with the alpha value in r8 and, through the nullable
   pointer r10, the fixed dye value in r0 is stored for non-matching pixels.
   The palette is never read.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 once per iteration (one pixel pair)
     TMP1     Scratch register for the second pixel of a pair
     TMP2     Scratch register for the first pixel of a pair
     OFF1     Displacement from the output pointer for the first pixel
     OFF2     Displacement from the output pointer for the second pixel
   START, END and EPILOGUE are macros from image_macros.S, which is not part
   of this listing; presumably START/END open and close the per-line loop
   around the region delimited by labels 2 and 3. */
.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette (don't care) */
/* r2 is halved: the loop counts pixel pairs */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
/* r0 = 4 * r2: number of output bytes covered by the pixel pairs */
shll2 r0
mov.l r10, @-r15
shll r7
mov.l r11, @-r15
/* r7 = r5 + 2 * edge_2: right edge pointer */
add r5, r7
/* r12 = r0 + r6: right edge stride, added to r7 after each line */
mov r0, r12
add r6, r12
mov.l r13, @-r15
add #-4, r5
mov.l r14, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
.if \HFLIP
/* r5 += r0 and r6 += 2 * r0 */
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */
nop
mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */
mov #0, r11
START
/* Per-line prologue: preload two input bytes, keep the right edge value in
r9, and compare the first pixel with the alpha value */
mov.b @r3+, \TMP2
nop
mov.w @r7, r9 /* Save right edge */
nop
mov.b @r3+, \TMP1
cmp/eq \TMP2, r8
/* r10 = -1 + T: zero if the pixel equals the alpha value, all ones
otherwise */
2: mov #-1, r10
addc r11, r10 /* r10 is now the mask */
/* r10 = r5 for an opaque pixel, 0 for a transparent one */
and r5, r10
nop
mov.b @r3+, \TMP2
cmp/eq \TMP1, r8
mov.w r0, @(\OFF1, r10)
add #\OUT_DIR, r5
/* Same masking for the second pixel of the pair */
mov #-1, r10
addc r11, r10
mov.b @r3+, \TMP1
and r5, r10
/* Comparison for the first pixel of the next iteration */
cmp/eq \TMP2, r8
3: mov.w r0, @(\OFF2, r10)
mov.w r9, @r7 /* Restore right edge */
add r12, r7
END
/* Restore the registers pushed during setup, in reverse order */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point of the P8 DYE loop, reached through the jump at the end of
   _gint_image_p8_loop. At that point r0 holds cmd.effect shifted right once,
   so bit 0 selects the variant: clear falls into the HFLIP=0 expansion, set
   branches to the HFLIP=1 expansion at label 9. */
_gint_image_p8_dye:
tst #1, r0
bf 9f
GEN_DYE_LOOP 0, 4, r13, r14, 4, 2
9: GEN_DYE_LOOP 1, -4, r13, r14, 2, 4

View File

@ -0,0 +1,23 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_p8_dye(): Render a whole P8 image in a single color
   Forwards to dsubimage_p8_dye() with a source rectangle that starts at (0,0)
   and spans the full width and height of the image. */
void dimage_p8_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
/* dsubimage_p8_dye(): Render a section of a P8 image in a single color
   The image's alpha value (img->alpha) is stored in cmd.color_1 and the dye
   color in cmd.color_2, then the command is run with the P8 DYE loop.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p8_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, true, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as the other DYE wrappers */
		command.effect += 12;
		command.color_1 = img->alpha;
		command.color_2 = dye_color;
		command.loop = gint_image_p8_dye;
		gint_image_p8_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,32 @@
#include <gint/image.h>
/* dsubimage_p8_effect(): Render a section of a P8 image with a dynamic effect
   The color effect is selected from the bits of eff, tested in this order:
   IMAGE_CLEARBG, IMAGE_SWAPCOLOR, IMAGE_ADDBG, IMAGE_DYE. Only the first
   match is applied; its parameters are read from the variadic arguments as
   ints (one for CLEARBG, ADDBG and DYE; two for SWAPCOLOR). Without any of
   these bits the image is rendered with dsubimage_p8(). */
void dsubimage_p8_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	va_list ap;
	va_start(ap, eff);

	if(eff & IMAGE_CLEARBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
			bg_color);
	}
	else if(eff & IMAGE_SWAPCOLOR) {
		/* Two separate statements keep the argument order defined */
		int const old_index = va_arg(ap, int);
		int const new_color = va_arg(ap, int);
		dsubimage_p8_swapcolor(x, y, img, left, top, w, h, eff,
			old_index, new_color);
	}
	else if(eff & IMAGE_ADDBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_p8_addbg(x, y, img, left, top, w, h, eff, bg_color);
	}
	else if(eff & IMAGE_DYE) {
		int const dye_color = va_arg(ap, int);
		dsubimage_p8_dye(x, y, img, left, top, w, h, eff, dye_color);
	}
	else {
		dsubimage_p8(x, y, img, left, top, w, h, eff);
	}

	va_end(ap);
}

View File

@ -0,0 +1,42 @@
.global _gint_image_p8_normal
#include "image_macros.S"
/* P8 Opaque rendering, RAM version: trivial.
As usual with RAM it is fairly easy to bottleneck writing speed, and so
there is no need for complex methods. Building longwords could be an option,
but it would require output alignment with edges, which is painful. */
/* GEN_NORMAL_LOOP: body of the P8 opaque renderer for one horizontal
   direction. Each input byte is doubled into a byte offset into the palette
   at r9, and the 16-bit color found there is stored at the output pointer r5.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 after each pixel
   The column loop is driven manually with [dt r8]. END and EPILOGUE are
   macros from image_macros.S, which is not part of this listing. */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR
mov.l @r8+, r9 /* cmd.palette */
.if \HFLIP
/* r5 += 2 * r2 - 2 and r6 += 4 * r2 */
add #-2, r5
mov r2, r0
shll r0
add r0, r5
shll r0
add r0, r6
.endif
/* Per-line entry: reload the column counter from r2 */
1: mov r2, r8
2: mov.b @r3+, r0
shll r0
mov.w @(r0, r9), r0
mov.w r0, @r5
/* r5 is advanced in the delay slot, whether or not the branch is taken */
3: dt r8
bf.s 2b
add #\OUT_DIR, r5
END
EPILOGUE
.endm
/* Entry point of the P8 opaque loop, reached through the jump at the end of
   _gint_image_p8_loop. At that point r0 holds cmd.effect shifted right once,
   so bit 0 selects the variant: clear falls into the HFLIP=0 expansion, set
   branches to the HFLIP=1 expansion at label 9. */
_gint_image_p8_normal:
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 2
9: GEN_NORMAL_LOOP 1, -2

View File

@ -0,0 +1,77 @@
.global _gint_image_p8_swapcolor
#include "image_macros.S"
/* P8 SWAPCOLOR, RAM version: by branchless xor selection.
The core action of this loop is to render full pixels while replacing any
occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). Branching is
too slow as often, so instead we use the fact that both x and y are fixed to
use the identity
c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c)
We materialize -(c == x) by subtracting a register from itself with subc
after the comparison (which is delightfully elegant), while (x ^ y) is pre-
computed. This way, the selection is performed in one [subc], one [and] and
one [xor] for a total of 3 EX slots. This is slower than NULL-cancelling
(which only takes 2 EX slots) but still better than symmetric alternatives.
Since we have a palette, we further trick by comparing against the index but
selecting against the palette entry, ie. we do
palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c])
which allows the computation to occur in parallel with the palette access
and does not require the replacement value to be located at a valid index.
r0: [temporary]
r7: cmd.color_1
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Holds (x ^ y) & -(c == x) during selection */
/* GEN_SWAPCOLOR_LOOP: body of the P8 SWAPCOLOR renderer for one horizontal
   direction. Every iteration of the inner loop (labels 2 to 3) handles one
   pixel.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 after each pixel
   START, END and EPILOGUE are macros from image_macros.S, which is not part
   of this listing; presumably START/END open and close the per-line loop
   around the region delimited by labels 2 and 3. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
mov.l @r8+, r9 /* cmd.palette */
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
mov.w @r8+, r7 /* cmd.color_1 */
mov.l r10, @-r15
/* Sign-extend the index from 8 bits, matching the sign-extending byte load
of pixels in the loop */
exts.b r7, r7
mov r7, r0
mov.w @r8, r8 /* cmd.color_2 */
/* r0 = palette[color_1], then r8 = color_2 ^ palette[color_1] */
add r0, r0
mov.w @(r0, r9), r0
xor r0, r8
.if \HFLIP
/* r5 += 2 * r2 - 2 and r6 += 4 * r2 */
add #-2, r5
mov r2, r0
shll r0
add r0, r5
shll r0
add r0, r6
.endif
START
2: mov.b @r3+, r0
cmp/eq r0, r7
add r0, r0
/* r10 = -T, ie. all ones if the index equals color_1 and zero otherwise */
subc r10, r10
mov.w @(r0, r9), r0
and r8, r10
/* Selection: r0 becomes color_2 if the index matched, palette value
otherwise */
xor r10, r0
mov.w r0, @r5
3: add #\OUT_DIR, r5
END
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point of the P8 SWAPCOLOR loop, reached through the jump at the end
   of _gint_image_p8_loop. At that point r0 holds cmd.effect shifted right
   once, so bit 0 selects the variant: clear falls into the HFLIP=0 expansion,
   set branches to the HFLIP=1 expansion at label 9. */
_gint_image_p8_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 2
9: GEN_SWAPCOLOR_LOOP 1, -2

View File

@ -0,0 +1,46 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_p8_swapcolor(): Render a whole P8 image with one color replaced
   Forwards to dsubimage_p8_swapcolor() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_p8_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
/* dsubimage_p8_swapcolor(): Render a section of a P8 image, replacing one
   palette index with another color
   old_index is stored in cmd.color_1 and new_color in cmd.color_2, then the
   command is run with the P8 SWAPCOLOR loop.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p8_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_index, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = old_index;
		command.color_2 = new_color;
		command.loop = gint_image_p8_swapcolor;
		gint_image_p8_loop(DWIDTH, &command);
	}
}
/* dimage_p8_addbg(): Render a whole P8 image over a background color
   Forwards to dsubimage_p8_addbg() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_p8_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
/* dsubimage_p8_addbg(): Render a section of a P8 image over a background
   This is the SWAPCOLOR loop with the image's own alpha value (img->alpha) as
   the color to replace and bg_color as the replacement.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_p8_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = img->alpha;
		command.color_2 = bg_color;
		command.loop = gint_image_p8_swapcolor;
		gint_image_p8_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,69 @@
.global _gint_image_rgb16_loop
/* gint's image renderer: 16-bit RGB entry point
These formats are the simplest of the bunch. RGB565 can use longword access
in cases when alignment is favorable and no geometric effect is applied. In
other cases, pixels are handled individually; geometric effects affect the
input/output logic while color effects change the computations themselves.
r0: - (initially: cmd.effect)
r1: Number of lines remaining to draw
r2: Number of columns per line
r3: Input pointer
r4: Input stride
r5: Output pointer
r6: Output stride
r7: Right edge (only used in Azur) or [temporary]
r8: - (initially: cmd)
r9: - (initially: cmd.loop) */
/* Entry point for 16-bit RGB rendering: unpacks the fields of the command
   structure into registers, then jumps to the loop function stored in the
   command. On exit r8 points just past cmd.palette, and the caller's r8 and
   r9 are left on the stack. */
_gint_image_rgb16_loop:
/* r4: int output_width (pixels)
r5: struct gint_image_cmd *cmd */
mov.b @(1,r5), r0 /* cmd.effect */
add #2, r5
mov.l r8, @-r15
mov r4, r6
mov.w @r5+, r2 /* cmd.columns */
mov r5, r8
/* From here on the command is r8 */
mov.l r9, @-r15
shlr r0 /* T bit is now VFLIP */
mov.w @r8+, r4 /* cmd.input_stride */
/* r6 = (output_width - columns) * 2, the output stride in bytes */
sub r2, r6
mov.b @r8+, r1 /* cmd.lines */
add r6, r6
mov.b @r8+, r9 /* cmd.edge_1 (don't care) */
nop
/* r9 = next longword of the command, which is the jump target below
(cmd.loop) */
mov.l @r8+, r9
/* cmd.lines was loaded sign-extended; make it unsigned */
extu.b r1, r1
mov.l @r8+, r5 /* cmd.output */
nop
mov.l @r8+, r3 /* cmd.input */
nop
/* T still holds the bit shifted out of cmd.effect above */
bf.s _NO_VFLIP
add #4, r8 /* cmd.palette (don't care) */
_VFLIP:
/* Negate the input stride when the VFLIP bit is set */
neg r4, r4
nop
_NO_VFLIP:
/* r4 = (input_stride - columns) * 2; the doubling happens in the delay
slot of the jump */
sub r2, r4
nop
jmp @r9
add r4, r4

View File

@ -0,0 +1,43 @@
#include <gint/image.h>
#include <gint/display.h>
/* dimage_rgb16(): Render a whole 16-bit RGB image
   Forwards to dsubimage_rgb16() with a source rectangle that starts at (0,0)
   and spans the full width and height of the image. */
void dimage_rgb16(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16(x, y, img, 0, 0, full_w, full_h, eff);
}
/* dsubimage_rgb16(): Render a section of a 16-bit RGB image

   @x @y       Target position passed to the command builder
   @img        Source image
   @left @top  Top-left corner of the source rectangle
   @w @h       Size of the source rectangle
   @eff        Effect flags, forwarded to gint_image_mkcmd()

   Images whose profile is IMAGE_RGB565A are delegated to
   dsubimage_rgb16_clearbg() with the image's own alpha value; all other
   images go through the opaque loop. Nothing is drawn when
   gint_image_mkcmd() returns false. */
void dsubimage_rgb16(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	if(img->profile == IMAGE_RGB565A) {
		/* A return statement with an expression is not allowed in a
		   void function in ISO C, so call first and return after */
		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT)) return;

	cmd.loop = gint_image_rgb16_normal;
	gint_image_rgb16_loop(DWIDTH, &cmd);
}
/* dimage_rgb16_clearbg(): Render a whole 16-bit RGB image with one color
   transparent
   Forwards to dsubimage_rgb16_clearbg() with a source rectangle that starts
   at (0,0) and spans the full width and height of the image. */
void dimage_rgb16_clearbg(int x, int y, image_t const *img, int eff,int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
/* dsubimage_rgb16_clearbg(): Render a section of a 16-bit RGB image, skipping
   pixels equal to bg_color
   The color to skip is stored in cmd.color_1 and the command is run with the
   RGB16 CLEARBG loop. Nothing is drawn when gint_image_mkcmd() returns
   false. */
void dsubimage_rgb16_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as the other CLEARBG wrappers */
		command.effect += 4;
		command.color_1 = bg_color;
		command.loop = gint_image_rgb16_clearbg;
		gint_image_rgb16_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,53 @@
.global _gint_image_rgb16_clearbg
.global _gint_image_rgb16_dye
#include "image_macros.S"
/* RGB16 CLEARBG and DYE, RAM version: trivial.
This function handles both CLEARBG and DYE; in RGB16 they are the same,
except that DYE writes not the pixel value (TMP) but a fixed color (SRC). As
is often the case, the RAM speed is limiting, so there is no point in
improving speed of the code on the CPU side. */
/* GEN_CLEARBG_DYE_LOOP: shared body of the RGB16 CLEARBG and DYE renderers
   for one horizontal direction. Each pixel is loaded into TMP and compared
   with the alpha color in r9; when they differ, SRC is stored at r5.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Twice the byte step added to r5 after each pixel
     TMP      Register receiving each input pixel
     SRC      Register stored for non-transparent pixels
   The column loop is driven manually with [dt r8]. END and EPILOGUE are
   macros from image_macros.S, which is not part of this listing. */
.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP, SRC
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
mov.w @r8+, r9 /* cmd.color_1 (alpha color) */
mov.w @r8+, r0 /* cmd.color_2 (dye color) */
.if \HFLIP
/* r5 += 2 * r2 - 2 and r6 += 4 * r2 */
add #-2, r5
mov r2, r8
shll r8
add r8, r5
shll r8
add r8, r6
.endif
/* Per-line entry: reload the column counter from r2 */
1: mov r2, r8
2: mov.w @r3+, \TMP
cmp/eq \TMP, r9
/* Skip the store for pixels equal to the alpha color */
bt 3f
mov.w \SRC, @r5
/* r5 is advanced in the delay slot, whether or not the branch is taken */
3: dt r8
bf.s 2b
add #(\OUT_DIR/2), r5
END
EPILOGUE
.endm
/* Entry point of the RGB16 CLEARBG loop, reached through the jump at the end
   of _gint_image_rgb16_loop. At that point r0 holds cmd.effect shifted right
   once, so bit 0 selects the variant: clear falls into the HFLIP=0 expansion,
   set branches to the HFLIP=1 expansion at label 9. TMP and SRC are both r0,
   so the value stored is the pixel that was just loaded. */
_gint_image_rgb16_clearbg:
tst #1, r0
bf 9f
GEN_CLEARBG_DYE_LOOP 0, 4, r0, r0
9: GEN_CLEARBG_DYE_LOOP 1, -4, r0, r0
/* Entry point of the RGB16 DYE loop, reached through the jump at the end of
   _gint_image_rgb16_loop. At that point r0 holds cmd.effect shifted right
   once, so bit 0 selects the variant: clear falls into the HFLIP=0 expansion,
   set branches to the HFLIP=1 expansion at label 9. Pixels are loaded into r7
   while r0 keeps cmd.color_2, so the value stored is always the dye color. */
_gint_image_rgb16_dye:
tst #1, r0
bf 9f
GEN_CLEARBG_DYE_LOOP 0, 4, r7, r0
9: GEN_CLEARBG_DYE_LOOP 1, -4, r7, r0

View File

@ -0,0 +1,23 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_rgb16_dye(): Render a whole 16-bit RGB image in a single color
   Forwards to dsubimage_rgb16_dye() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_rgb16_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
/* dsubimage_rgb16_dye(): Render a section of a 16-bit RGB image in a single
   color
   The image's alpha value (img->alpha) is stored in cmd.color_1 and the dye
   color in cmd.color_2, then the command is run with the RGB16 DYE loop.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_rgb16_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as the other DYE wrappers */
		command.effect += 12;
		command.color_1 = img->alpha;
		command.color_2 = dye_color;
		command.loop = gint_image_rgb16_dye;
		gint_image_rgb16_loop(DWIDTH, &command);
	}
}

View File

@ -0,0 +1,32 @@
#include <gint/image.h>
/* dsubimage_rgb16_effect(): Render a section of a 16-bit RGB image with a
   dynamic effect
   The color effect is selected from the bits of eff, tested in this order:
   IMAGE_CLEARBG, IMAGE_SWAPCOLOR, IMAGE_ADDBG, IMAGE_DYE. Only the first
   match is applied; its parameters are read from the variadic arguments as
   ints (one for CLEARBG, ADDBG and DYE; two for SWAPCOLOR). Without any of
   these bits the image is rendered with dsubimage_rgb16(). */
void dsubimage_rgb16_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	va_list ap;
	va_start(ap, eff);

	if(eff & IMAGE_CLEARBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
			bg_color);
	}
	else if(eff & IMAGE_SWAPCOLOR) {
		/* Two separate statements keep the argument order defined */
		int const old_color = va_arg(ap, int);
		int const new_color = va_arg(ap, int);
		dsubimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
			old_color, new_color);
	}
	else if(eff & IMAGE_ADDBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_rgb16_addbg(x, y, img, left, top, w, h, eff,
			bg_color);
	}
	else if(eff & IMAGE_DYE) {
		int const dye_color = va_arg(ap, int);
		dsubimage_rgb16_dye(x, y, img, left, top, w, h, eff,
			dye_color);
	}
	else {
		dsubimage_rgb16(x, y, img, left, top, w, h, eff);
	}

	va_end(ap);
}

View File

@ -0,0 +1,201 @@
.global _gint_image_rgb16_normal
#include "image_macros.S"
/* RGB16 Opaque rendering, RAM version: by longword access.
This function of the image renderer is designed for the RAM model only. At
default overclock levels, the RAM can register a write every 13-14 cycles,
regardless of size. Since this amount of time is more than enough to build a
target longword regardless of alignment and geometry considerations, the
main and only focus of this function is to only write longwords.
Since longwords can only be written at 4-aligned addresses and always make
pairs of pixels, there are variations on the loop depending on the rendered
width and destination. These are marked with the following convention:
* w1 / w2 denotes the parity of the command width;
* o2 / o4 denotes the alignment of the output.
There is a forward and a backward variation for all four combinations of
these parameters, noted F_ and B_ in label names. Some word-based variations
are provided for width <= 8, which is just a way to ensure that the longword-
based loops always have at least one iteration, since they're implemented as
do/while.
The loops themselves are nowhere near tight on the CPU side and entirely
bottlenecked by the RAM, hence the simplicity and complete disregard for
superscalar parallelism. */
/* Entry point of the RGB16 opaque loop, reached through the jump at the end
   of _gint_image_rgb16_loop. Dispatches on two conditions: bit 0 of r0
   (cmd.effect shifted right once) selects forward or backward output, and
   the comparison of the width r2 with 8 selects the word or longword copy. */
_gint_image_rgb16_normal:
/* We use word copy for width <= 8; this is to ensure that there is at
least one longword in the non-trivial loop, simplifying checks */
tst #1, r0
mov #8, r0
/* The branch uses the T bit from the tst above; the delay slot then sets
T = (8 >= r2) for the bt/bt.s that follow */
bf.s .BACKWARD
cmp/ge r2, r0
.FORWARD:
bt _FORWARD_WORD_COPY
nop
bra _FORWARD_LONG_COPY
nop
.BACKWARD:
/* r5 += 2 * r2 (the backward loops store with pre-decrement) and
r6 += 4 * r2; none of these instructions changes the T bit */
mov r2, r0
add r0, r0
add r0, r5
add r0, r0
bt.s _BACKWARD_WORD_COPY
add r0, r6
bra _BACKWARD_LONG_COPY
nop
/* Forward copy, one 16-bit pixel per iteration, through the DSP register x0.
   Both the input pointer r3 and the output pointer r5 are post-incremented.
   START, END and EPILOGUE come from image_macros.S (not shown here). */
_FORWARD_WORD_COPY:
START
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END
EPILOGUE
/* Backward copy, one 16-bit pixel per iteration, through the DSP register x0.
   The input pointer r3 is post-incremented while the output pointer r5 is
   pre-decremented, so each line is written right to left.
   START, END and EPILOGUE come from image_macros.S (not shown here). */
_BACKWARD_WORD_COPY:
START
2: movs.w @r3+, x0
3: movs.w x0, @-r5
END
EPILOGUE
/* Forward copy by longwords. r2 is halved into a count of pixel pairs, and
   the bit shifted out gives the width parity (w1 = odd, w2 = even). Bit 1 of
   the output pointer r5 then selects the o4 (clear) or o2 (set) variant.
   In every variant the inner loop (labels 2 to 3) loads two pixels, packs
   them into one longword with the first pixel in the upper half, and stores
   it at r5. Instructions between START and label 2, and between label 3 and
   END, copy a single leading or trailing pixel as a word.
   START, END and EPILOGUE come from image_macros.S (not shown here). */
_FORWARD_LONG_COPY:
shlr r2 /* Test width parity */
mov #2, r0
bt .F_w1
nop
.F_w2: tst r0, r5 /* Test alignment of output */
bf .F_w2o2
/* Even width, bit 1 of r5 clear: longwords only */
.F_w2o4:
START
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r7
xtrct r0, r7
mov.l r7, @r5
3: add #4, r5
END
EPILOGUE
/* Even width, bit 1 of r5 set: one leading word, one pair fewer in the
loop (hence the decrement of r2), one trailing word */
.F_w2o2:
add #-1, r2
START
mov.w @r3+, r0
mov.w r0, @r5
add #2, r5
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r7
xtrct r0, r7
mov.l r7, @r5
3: add #4, r5
mov.w @r3+, r0
mov.w r0, @r5
add #2, r5
END
EPILOGUE
.F_w1: tst r0, r5 /* Test alignment of output */
bf .F_w1o2
/* Odd width, bit 1 of r5 clear: longwords then one trailing word */
.F_w1o4:
START
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r7
xtrct r0, r7
mov.l r7, @r5
3: add #4, r5
mov.w @r3+, r0
mov.w r0, @r5
add #2, r5
END
EPILOGUE
/* Odd width, bit 1 of r5 set: one leading word then longwords */
.F_w1o2:
START
mov.w @r3+, r0
mov.w r0, @r5
add #2, r5
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r7
xtrct r0, r7
mov.l r7, @r5
3: add #4, r5
END
EPILOGUE
/* Backward copy by longwords. r2 is halved into a count of pixel pairs, and
   the bit shifted out gives the width parity (w1 = odd, w2 = even). Bit 1 of
   the output pointer r5 then selects the o4 (clear) or o2 (set) variant.
   In every variant the inner loop (labels 2 to 3) loads two pixels, packs
   them into one longword with the second pixel in the upper half, and stores
   it with pre-decrement of r5. Instructions between START and label 2, and
   between label 3 and END, copy a single leading or trailing pixel as a
   word.
   START, END and EPILOGUE come from image_macros.S (not shown here). */
_BACKWARD_LONG_COPY:
shlr r2 /* Test width parity */
mov #2, r0
bt .B_w1
nop
.B_w2: tst r0, r5 /* Test alignment of output */
bf .B_w2o2
/* Even width, bit 1 of r5 clear: longwords only */
.B_w2o4:
START
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r0
xtrct r7, r0
3: mov.l r0, @-r5
END
EPILOGUE
/* Even width, bit 1 of r5 set: one leading word, one pair fewer in the
loop (hence the decrement of r2), one trailing word */
.B_w2o2:
add #-1, r2
START
mov.w @r3+, r0
mov.w r0, @-r5
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r0
xtrct r7, r0
3: mov.l r0, @-r5
mov.w @r3+, r0
mov.w r0, @-r5
END
EPILOGUE
.B_w1: tst r0, r5 /* Test alignment of output */
bf .B_w1o2
/* Odd width, bit 1 of r5 clear: longwords then one trailing word */
.B_w1o4:
START
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r0
xtrct r7, r0
3: mov.l r0, @-r5
mov.w @r3+, r0
mov.w r0, @-r5
END
EPILOGUE
/* Odd width, bit 1 of r5 set: one leading word then longwords */
.B_w1o2:
START
mov.w @r3+, r0
mov.w r0, @-r5
2: mov.w @r3+, r0
mov.w @r3+, r7
shll16 r0
xtrct r7, r0
3: mov.l r0, @-r5
END
EPILOGUE

View File

@ -0,0 +1,45 @@
.global _gint_image_rgb16_swapcolor
#include "image_macros.S"
/* RGB16 SWAPCOLOR, RAM version: trivial.
This function is once again bottlenecked by RAM. Generating longwords would
be tight and require significant adjustments, so we stick to words, and the
trivial bopti-style version already maxes out the output rate. */
/* GEN_SWAPCOLOR_LOOP: body of the RGB16 SWAPCOLOR renderer for one horizontal
   direction. Each pixel is loaded and compared with cmd.color_1 (r9); a
   matching pixel is replaced with cmd.color_2 (r7) before being stored at r5.
   Parameters:
     HFLIP    Non-zero to assemble the extra r5/r6 adjustment below
     OUT_DIR  Byte step added to r5 after each pixel
   The column loop is driven manually with [dt r8]. END and EPILOGUE are
   macros from image_macros.S, which is not part of this listing. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
mov.w @r8+, r9 /* cmd.color_1 */
mov.w @r8+, r7 /* cmd.color_2 */
.if \HFLIP
/* r5 += 2 * r2 - 2 and r6 += 4 * r2 */
add #-2, r5
mov r2, r0
shll r0
add r0, r5
shll r0
add r0, r6
.endif
/* Per-line entry: reload the column counter from r2 */
1: mov r2, r8
2: mov.w @r3+, r0
cmp/eq r0, r9
/* Keep the loaded pixel unless it equals color_1 */
bf 4f
mov r7, r0
4: mov.w r0, @r5
/* r5 is advanced in the delay slot, whether or not the branch is taken */
3: dt r8
bf.s 2b
add #\OUT_DIR, r5
END
EPILOGUE
.endm
/* Entry point of the RGB16 SWAPCOLOR loop. Bit 0 of r0 selects the variant:
   when it is clear execution falls into the HFLIP=0 expansion, when it is set
   the branch goes to the HFLIP=1 expansion at label 9.
   NOTE(review): r0 is presumably cmd.effect shifted right once, as set up by
   _gint_image_rgb16_loop in image_rgb16.S - confirm. */
_gint_image_rgb16_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 2
9: GEN_SWAPCOLOR_LOOP 1, -2

View File

@ -0,0 +1,46 @@
#include <gint/display.h>
#include <gint/image.h>
/* dimage_rgb16_swapcolor(): Render a whole 16-bit RGB image with one color
   replaced
   Forwards to dsubimage_rgb16_swapcolor() with a source rectangle that starts
   at (0,0) and spans the full width and height of the image. */
void dimage_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
/* dsubimage_rgb16_swapcolor(): Render a section of a 16-bit RGB image,
   replacing one color with another
   old_color is stored in cmd.color_1 and new_color in cmd.color_2, then the
   command is run with the RGB16 SWAPCOLOR loop.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_rgb16_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_color, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = old_color;
		command.color_2 = new_color;
		command.loop = gint_image_rgb16_swapcolor;
		gint_image_rgb16_loop(DWIDTH, &command);
	}
}
/* dimage_rgb16_addbg(): Render a whole 16-bit RGB image over a background
   color
   Forwards to dsubimage_rgb16_addbg() with a source rectangle that starts at
   (0,0) and spans the full width and height of the image. */
void dimage_rgb16_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
/* dsubimage_rgb16_addbg(): Render a section of a 16-bit RGB image over a
   background
   This is the SWAPCOLOR loop with the image's own alpha value (img->alpha) as
   the color to replace and bg_color as the replacement.
   Nothing is drawn when gint_image_mkcmd() returns false. */
void dsubimage_rgb16_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command,
		DWIDTH, DHEIGHT))
	{
		/* Same effect increment as every SWAPCOLOR/ADDBG wrapper */
		command.effect += 8;
		command.color_1 = img->alpha;
		command.color_2 = bg_color;
		command.loop = gint_image_rgb16_swapcolor;
		gint_image_rgb16_loop(DWIDTH, &command);
	}
}