diff --git a/CMakeLists.txt b/CMakeLists.txt index 780b82b..a5e5ca3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,35 @@ set(SOURCES_CG src/render-cg/gint_dline.c src/render-cg/topti-asm.s src/render-cg/topti.c + # Fast image renderer + src/render-cg/image/image.c + src/render-cg/image/image_rgb16.S + src/render-cg/image/image_rgb16_normal.S + src/render-cg/image/image_rgb16_clearbg_dye.S + src/render-cg/image/image_rgb16_swapcolor.S + src/render-cg/image/image_p8.S + src/render-cg/image/image_p8_normal.S + src/render-cg/image/image_p8_clearbg.S + src/render-cg/image/image_p8_swapcolor.S + src/render-cg/image/image_p8_dye.S + src/render-cg/image/image_p4.S + src/render-cg/image/image_p4_normal.S + src/render-cg/image/image_p4_clearbg.S + src/render-cg/image/image_p4_swapcolor.S + src/render-cg/image/image_p4_dye.S + # Interface to the fast image renderer + src/render-cg/image/image_rgb16.c + src/render-cg/image/image_rgb16_effect.c + src/render-cg/image/image_rgb16_swapcolor.c + src/render-cg/image/image_rgb16_dye.c + src/render-cg/image/image_p8.c + src/render-cg/image/image_p8_effect.c + src/render-cg/image/image_p8_swapcolor.c + src/render-cg/image/image_p8_dye.c + src/render-cg/image/image_p4.c + src/render-cg/image/image_p4_effect.c + src/render-cg/image/image_p4_swapcolor.c + src/render-cg/image/image_p4_dye.c ) set(ASSETS_FX src/font5x7.png) diff --git a/include/gint/display-cg.h b/include/gint/display-cg.h index d4d5916..2bc580c 100644 --- a/include/gint/display-cg.h +++ b/include/gint/display-cg.h @@ -1,11 +1,15 @@ //--- -// gint:display-cg - fxcg50 rendering functions +// gint:display-cg - fx-CG 50 rendering functions // -// This module covers all 16-bit opaque rendering functions. For -// gamma-related functions, color composition, check out a color library. +// This module covers rendering functions specific to the fx-CG 50. 
In addition +// to triple-buffering management, this mainly includes image manipulation +// tools as well as the very versatile dimage_effect() and dsubimage_effect() +// functions that support high-performance image rendering with a number of +// geometric and color effects. // -// All the functions in this module work on a 396x224 resolution - gint -// lets you use the full surface! +// The fx-CG OS restricts the display to a 384x216 rectangle roughly around the +// center, leaving margins on three sides. However, gint configures the display +// to use the full 396x224 surface! //--- #ifndef GINT_DISPLAY_CG @@ -18,6 +22,7 @@ extern "C" { #endif #include +#include /* Dimensions of the VRAM */ #define DWIDTH 396 @@ -57,49 +62,9 @@ enum { green is not used). */ #define C_RGB(r,g,b) (((r) << 11) | ((g) << 6) | (b)) -//--- -// Image rendering (bopti) -//--- +/* See <gint/image.h> for the details on image manipulation. */ +typedef image_t bopti_image_t; -/* bopti_image_t: Image files encoded for bopti - This format is created by the fxSDK's [fxconv] tool from standard images. */ -typedef struct -{ - /* Color profile (type of palette), could be extended into a bit field - later on */ - uint16_t profile; - - /* Color code assigned to transparent pixels (unused in 16-bit). In - P8_RGB565A, the value assigned to alpha is always 0. */ - uint16_t alpha; - - /* Full width and height, in pixels */ - uint16_t width; - uint16_t height; - - /* Here we lose structure because of the flexible array.
- - RGB565, RGB565A: - * Pixels in row-major order, 16 bits per pixel - P8: - * Palette with 256 entries (512 bytes total) - * Pixels in row-major order, 8 bits per pixel - P8_RGB565A, P8_RGB565: - * Number of entries in palette, N (2 bytes) - * Palette with N entries (2N bytes) - * Pixels in row-major order, 8 bits per pixel (signed indices in - an uint16_t array starting at +<256 bytes>) - P4/P4_RGB565A, P4_RGB565: - * Palette with 16 entries (32 bytes total) - * Pixels in row-major order, 4 bits per pixel, each row - byte-padded */ - uint16_t data[]; - -} GPACKED(4) bopti_image_t; - -/* Old alias to image_t, now deprecated because of libimg */ -typedef bopti_image_t image_t __attribute__((deprecated( - "image_t has been renamed to bopti_image_t"))); //--- // Video RAM management diff --git a/include/gint/image.h b/include/gint/image.h new file mode 100644 index 0000000..b54dd75 --- /dev/null +++ b/include/gint/image.h @@ -0,0 +1,365 @@ +//--- +// gint:image - Image manipulation and rendering +// +// Note: this module is currently only available on fx-CG. +// +// This header provides image manipulation functions. This mainly consists of a +// reference-based image format, various access and modification functions, and +// a number of high-performance transformations and rendering effects. If you +// find yourself limited by rendering time, note that RAM writing speed is +// often the bottleneck, and image rendering is much faster in Azur (which is +// what the renderer was initially designed for). +// +// We support 3 bit depths: full-color 16-bit (RGB565), indexed 8-bit (P8) and +// indexed 4-bit (P4). All three have an "alpha" variation where one color is +// treated as transparent, leading to 6 total formats. +// +// The image renderers support so-called *dynamic effects*, which are image +// transformations performed on-the-fly while rendering, without generating an +// intermediate image. 
They comprise straightforward transformations that +// achieve similar performance to straight rendering and can be combined to +// some extent, which makes them reliable whenever applicable. +// +// TODO: Switch to libimg-style image refs. +//--- + +#ifndef GINT_IMAGE +#define GINT_IMAGE + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef FXCG50 +#error is only supported on FXCG50 +#else + +#include +#include + +//--- +// Image structures +//--- + +/* Image formats. Note that transparency really only indicates the default + rendering method, as a transparent background can always be added or removed + by a dynamic effect on any image. */ +enum { + IMAGE_RGB565 = 0, /* RGB565 without alpha */ + IMAGE_RGB565A = 1, /* RGB565 with one transparent color */ + IMAGE_P8_RGB565 = 4, /* 8-bit palette, all opaque colors */ + IMAGE_P8_RGB565A = 5, /* 8-bit with one transparent color */ + IMAGE_P4_RGB565 = 6, /* 4-bit palette, all opaque colors */ + IMAGE_P4_RGB565A = 3, /* 4-bit with one transparent color */ + + IMAGE_DEPRECATED_P8 = 2, +}; + +/* image_t: gint's native bitmap image format + Images of this format can be created through this header's API but also by + using the fxSDK's built-in image converters with fxconv. */ +typedef struct +{ + /* Color format, one of the IMAGE_* values defined above. */ + uint16_t profile; + /* For formats with alpha, value or index used for transparency. */ + uint16_t alpha; + /* Full width and height, in pixels */ + uint16_t width; + uint16_t height; + + /* Here we lose structure because of the flexible array. 
+ + RGB565, RGB565A: + * Pixels in row-major order, 16 bits per pixel + P8: + * Palette with 256 entries (512 bytes total) + * Pixels in row-major order, 8 bits per pixel + P8_RGB565A, P8_RGB565: + * Number of entries in palette, N (2 bytes) + * Palette with N entries (2N bytes) + * Pixels in row-major order, 8 bits per pixel (signed indices in + an uint16_t array starting at +<256 bytes>) + P4/P4_RGB565A, P4_RGB565: + * Palette with 16 entries (32 bytes total) + * Pixels in row-major order, 4 bits per pixel, each row + byte-padded */ + uint16_t data[]; + +} GPACKED(4) image_t; + +/* Dynamic effects: these transformations can be applied on images while + rendering. Not all effects can be combined; unless specified otherwise: + - HFLIP and VFLIP can both be added regardless of any other effect + - At most one color effect can be applied */ +enum { + /* Value 0x01 is reserved, because it is DIMAGE_NOCLIP, which although + part of the old API still needs to be supported. */ + + /* [Any]: Skip clipping the command against the source image */ + IMAGE_NOCLIP_INPUT = 0x04, + /* [Any]: Skip clipping the command against the output VRAM */ + IMAGE_NOCLIP_OUTPUT = 0x08, + /* [Any]: Skip clipping both */ + IMAGE_NOCLIP = IMAGE_NOCLIP_INPUT | IMAGE_NOCLIP_OUTPUT, + + // Geometric effects. These values should remain at exactly bit 8 and + // following, or change gint_image_mkcmd() along with it. 
+ + /* [Any]: Flip image vertically */ + IMAGE_VFLIP = 0x0100, + /* [Any]: Flip image horizontally */ + IMAGE_HFLIP = 0x0200, + + // Color effects + + /* [RGB565, P8_RGB565, P4_RGB565]: Make a color transparent + Adds one argument: + * Color to clear (RGB16: 16-bit value; P8/P4: palette index) */ + IMAGE_CLEARBG = 0x10, + /* [RGB565, P8_RGB565, P4_RGB565]: Turn a color into another + Adds two arguments: + * Color to replace (RGB16: 16-bit value; P8/P4: palette index) + * Replacement color (16-bit value) */ + IMAGE_SWAPCOLOR = 0x20, + /* [RGB565A, P8_RGB565A, P4_RGB565A]: Add a background + Adds one argument: + * Background color (16-bit value) */ + IMAGE_ADDBG = 0x40, + /* [RGB565A, P8_RGB565A, P4_RGB565A]: Dye all non-transparent pixels + Adds one argument: + * Dye color (16-bit value) */ + IMAGE_DYE = 0x80, +}; + +//--- +// Image access and information +//--- + +/* TODO: Expand */ + +int image_get_pixel(image_t const *img, int x, int y); + +int image_decode_pixel(image_t const *img, int pixel); + +//--- +// Image rendering functions +// +// The following functions extend dimage() and dsubimage(). The [effects] +// parameter takes a combination of IMAGE_* flags and effects, limited to the +// combinations previously described, with additional arguments depending on +// the color effect being applied. +// +// dimage_effect(x, y, img, effects, ...) +// dsubimage_effect(x, y, img, left, top, w, h, effects, ...) +// +// However if you use these super-generic functions you will link the code for +// all effects and all formats into your add-in, which takes a fair amount of +// space. If that's a problem, you can use the more specific functions below: +// +// * dimage_<FORMAT>_<EFFECT>() for one particular format (rgb16, p8, p4) along +// with one particular color effect (clearbg, swapcolor, addbg, dye). +// * dimage_<FORMAT>() is like the above when no color effect is applied. +// +// All of them support the HFLIP and VFLIP flags.
For effect-specific functions +// the corresponding effect flag can be omitted (e.g. IMAGE_CLEARBG is implicit +// when using dimage_p8_clearbg()). +//--- + +/* dimage_effect(): Generalized dimage() supporting dynamic effects */ +#define dimage_effect(x, y, img, eff, ...) \ + dsubimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \ + ##__VA_ARGS__) +/* dsubimage_effect(): Generalized dsubimage() supporting dynamic effects */ +void dsubimage_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int effects, ...); + +/* Specific versions for each format */ +#define DIMAGE_SIG1(NAME, ...) \ + void dimage_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \ + void dsubimage_ ## NAME(int x, int y, image_t const *img, \ + int left, int top, int w, int h, ##__VA_ARGS__); +#define DIMAGE_SIG(NAME, ...) \ + DIMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \ + DIMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \ + DIMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__) + +/* d[sub]image_{rgb16,p8,p4}_effect(..., effects, ...) */ +DIMAGE_SIG(_effect, int effects, ...) +/* d[sub]image_{rgb16,p8,p4}(..., effects) (no color effect, like dimage()) */ +DIMAGE_SIG(, int effects) +/* d[sub]image_{rgb16,p8,p4}_clearbg(..., effects, bg_color_or_index) */ +DIMAGE_SIG(_clearbg, int effects, int bg_color_or_index) +/* d[sub]image_{rgb16,p8,p4}_swapcolor(..., effects, source, replacement) */ +DIMAGE_SIG(_swapcolor, int effects, int source, int replacement) +/* d[sub]image_{rgb16,p8,p4}_addbg(..., effects, bg_color) */ +DIMAGE_SIG(_addbg, int effects, int bg_color) +/* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */ +DIMAGE_SIG(_dye, int effects, int dye_color) + +#define dimage_rgb16_effect(x, y, img, eff, ...) \ + dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) +#define dimage_p8_effect(x, y, img, eff, ...)
\ + dsubimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) +#define dimage_p4_effect(x, y, img, eff, ...) \ + dsubimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) + +#undef DIMAGE_SIG +#undef DIMAGE_SIG1 + +//--- +// Clipping utilities +//--- + +/* Double box specifying both a source and target area */ +struct gint_image_box +{ + /* Target location of top-left corner */ + int x, y; + /* Width and height of rendered sub-image */ + int w, h; + /* Source bounding box (low included, high excluded) */ + int left, top; +}; + +/* Clip the provided box against the input. If, after clipping, the box no + longer intersects the output (whose size is specified as out_w/out_h), + returns false. Otherwise, returns true. */ +bool gint_image_clip_input(image_t const *img, struct gint_image_box *box, + int out_w, int out_h); + +/* Clip the provided box against the output. */ +void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h); + +//--- +// Internal image rendering routines +// +// The following functions (or non-functions) are implemented in assembler and +// make up the internal interface of the image renderer. If you just want to +// display images, use dimage() and variations; these are only useful if you +// have a different rendering system and wish to use image rendering with +// dynamic effects in it. +//--- + +/* Renderer command. This structure includes most of the information used by + the image renderer to perform blits. Some of the information on the target + is also passed as direct arguments, which is more convenient and slightly + faster. + + Most of the values here can be set with gint_image_mkcmd(). The last two + members, along with the return values of the gint_image_FORMAT_loop() + functions, are used to update the command if one needs to draw *parts* of + the image and resume the rendering later. This is used in Azur. */ +struct gint_image_cmd +{ + /* Shader ID. 
This is used in Azur, and ignored in gint */ + uint8_t shader_id; + /* Dynamic effects + Bit 0: VFLIP + Bit 1: HFLIP + Bits 2-7: 0=NONE, 1=CLEARBG, 2=SWAPCOLOR, 3=DYE */ + uint8_t effect; + + /* Number of pixels to render per line. For formats that force either x + or width alignment (most of them), this is already adjusted to a + suitable multiple (usually a multiple of 2). */ + int16_t columns; + + /* Stride of the input image (number of pixels between each row), in + pixels, without subtracting the number of columns */ + int16_t input_stride; + + /* Number of lines in the command. This can be adjusted freely, and is + particularly useful in Azur for fragmented rendering. */ + uint8_t lines; + + /* [Any effect]: Offset of first edge */ + int8_t edge_1; + + /* Core loop; this is an internal label of the renderer */ + void const *loop; + /* Output pixel array, offset by target x/y */ + void const *output; + /* Input pixel array, offset by source x/y. For formats that force x + alignment, this is already adjusted. */ + void const *input; + /* Palette, when applicable */ + uint16_t const *palette; + + /* [Any effect]: Offset of right edge */ + int16_t edge_2; + /* [CLEARBG, SWAPCOLOR]: Source color */ + uint16_t color_1; + /* [SWAPCOLOR]: Destination color */ + uint16_t color_2; + + /* Remaining height (for updates between fragments) */ + int16_t height; + /* Local x position (for updates between fragments) */ + int16_t x; +}; + +/* gint_image_mkcmd(): Prepare a rendering command with dynamic effects + + This function crafts an image renderer command. It loads all the settings + except for effect-dependent parameters: the [.loop] label, the color section + of [.effect], and color effect settings. See the effect-specific functions + to see how they are defined. 
+ + The benefit of this approach is that the rendering code does not need to be + linked in unless an effect is actually used, which avoids blowing up the + size of the add-in as the number of supported dynamic effects increases. + + @box Requested on-screen box (will be clipped depending on effects) + @img Source image + @effects Set of dynamic effects to be applied, as an [IMAGE_*] bitmask + @left_edge Whether to force 2-alignment on the input (box->left) + @right_edge Whether to force 2-alignment on the width + @cmd Command to be filled + @out_width Output width (usually DWIDTH) + @out_height Output height (usually DHEIGHT) + + Returns false if there is nothing to render because of clipping (in which + case [cmd] is unchanged), true otherwise. [*box] is also updated to reflect + the final box after clipping but not accounting for edges. */ +bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img, + int effects, bool left_edge, bool right_edge, + struct gint_image_cmd *cmd, int out_width, int out_height); + +/* Entry point of the renderers. These functions can be called normally as long + as you can build the commands (e.g. by using gint_image_mkcmd() then filling + the effect-specific information). */ +void *gint_image_rgb16_loop (int output_width, struct gint_image_cmd *cmd); +void *gint_image_p8_loop (int output_width, struct gint_image_cmd *cmd); +void *gint_image_p4_loop (int output_width, struct gint_image_cmd *cmd); + +/* Renderer fragments. The following can absolutely not be called from C code + as they aren't full functions (and this isn't their prototype). These are + continuations to be specified in the [.loop] field of a command before using + one of the functions above.
*/ + +void gint_image_rgb16_normal(void); +void gint_image_rgb16_clearbg(void); +void gint_image_rgb16_swapcolor(void); +void gint_image_rgb16_dye(void); + +void gint_image_p8_normal(void); +void gint_image_p8_clearbg(void); +void gint_image_p8_swapcolor(void); +void gint_image_p8_dye(void); + +void gint_image_p4_normal(void); +void gint_image_p4_clearbg(void); +void gint_image_p4_swapcolor(void); +void gint_image_p4_dye(void); + +#endif /* FXCG50 */ + +#ifdef __cplusplus +} +#endif + +#endif /* GINT_IMAGE */ diff --git a/src/render-cg/image/image.c b/src/render-cg/image/image.c new file mode 100644 index 0000000..66b678d --- /dev/null +++ b/src/render-cg/image/image.c @@ -0,0 +1,107 @@ +#include +#include + +bool gint_image_clip_input(image_t const *img, struct gint_image_box *b, + int out_w, int out_h) +{ + /* Adjust the bounding box of the input image */ + if(b->left < 0) b->w += b->left, b->x -= b->left, b->left = 0; + if(b->top < 0) b->h += b->top, b->y -= b->top, b->top = 0; + if(b->left + b->w > img->width) b->w = img->width - b->left; + if(b->top + b->h > img->height) b->h = img->height - b->top; + + /* Check whether the box is non-empty and still intersects the output + (horizontal extent uses w against out_w, vertical extent uses h + against out_h) */ + if(b->w <= 0 || b->h <= 0) + return false; + if(b->x + b->w <= 0 || b->x >= out_w) + return false; + if(b->y + b->h <= 0 || b->y >= out_h) + return false; + + return true; +} + +void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h) +{ + /* Intersect with the bounding box on-screen */ + if(b->y < 0) b->top -= b->y, b->h += b->y, b->y = 0; + if(b->y + b->h > out_h) b->h = (out_h - b->y); + if(b->x < 0) b->left -= b->x, b->w += b->x, b->x = 0; + if(b->x + b->w > out_w) b->w = (out_w - b->x); +} + +bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img, + int effects, bool left_edge, bool right_edge, + struct gint_image_cmd *cmd, int out_width, int out_height) +{ + /* Convert the old DIMAGE_NOCLIP flag */ + if(effects & DIMAGE_NOCLIP) + effects |= IMAGE_NOCLIP; + + if(!(effects
& IMAGE_NOCLIP_INPUT)) { + if(!gint_image_clip_input(img, box, out_width, out_height)) + return false; + } + if(!(effects & IMAGE_NOCLIP_OUTPUT)) + gint_image_clip_output(box, out_width, out_height); + + cmd->effect = (effects & (IMAGE_VFLIP | IMAGE_HFLIP)) >> 8; + cmd->columns = box->w; + cmd->input_stride = img->width; + cmd->x = box->x; + cmd->edge_1 = -1; + cmd->edge_2 = -1; + + int p = img->profile; + int input_row = (effects & IMAGE_VFLIP) ? box->top+box->h-1 : box->top; + + if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) { + cmd->input_stride += (cmd->input_stride & 1); + cmd->input = (void *)img->data + + (input_row * cmd->input_stride + box->left) * 2; + } + else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) { + cmd->input = (void *)img->data + img->data[0] * 2 + 2 + + (input_row * img->width + box->left); + cmd->palette = (void *)img->data + 258; + } + else { + cmd->input = (void *)img->data + 32 + + input_row * ((img->width + 1) >> 1) + (box->left >> 1); + cmd->palette = img->data; + /* By default, use edge_1 to indicate (box->left & 1), so that + functions that don't use edge_1 can still work properly */ + if(!left_edge) + cmd->edge_1 = (box->left & 1); + } + + if(left_edge && (box->left & 1)) { + if(effects & IMAGE_HFLIP) { + cmd->edge_1 = cmd->columns; + } + else { + cmd->x--; + cmd->edge_1 = 0; + } + cmd->columns++; + } + if(right_edge && (cmd->columns & 1)) { + if(effects & IMAGE_HFLIP) { + cmd->x--; + cmd->edge_1++; + cmd->edge_2 = 0; + } + else { + cmd->edge_2 = cmd->columns; + } + cmd->columns++; + } + + /* Settings for further updates */ + cmd->height = box->h; + + /* This is the default for gint, but Azur overwrites it */ + cmd->lines = box->h; + cmd->output = (void *)gint_vram + (DWIDTH * box->y + cmd->x) * 2; + return true; +} diff --git a/src/render-cg/image/image_macros.S b/src/render-cg/image/image_macros.S new file mode 100644 index 0000000..5d5cf41 --- /dev/null +++ b/src/render-cg/image/image_macros.S @@ -0,0 +1,25 @@ +/* START: Sets up 
the inner and outer loop. The outer loop is anything between + the calls to macros START and END, while the inner loop is the code between + labels 2: and 3: (both *INCLUDED*). */ +.macro START + ldrs 2f + ldre 3f +1: ldrc r2 + nop +.endm + +/* END: Finishes the outer loop and adds strides. */ +.macro END + dt r1 + add r4, r3 + bf.s 1b + add r6, r5 +.endm + +/* EPILOGUE: Finishes the call by reloading registers saved in the prologue. */ +.macro EPILOGUE + mov.l @r15+, r9 + mov r3, r0 + rts + mov.l @r15+, r8 +.endm diff --git a/src/render-cg/image/image_p4.S b/src/render-cg/image/image_p4.S new file mode 100644 index 0000000..71240b7 --- /dev/null +++ b/src/render-cg/image/image_p4.S @@ -0,0 +1,86 @@ +.global _gint_image_p4_loop + +/* gint's image renderer: 4-bit indexed entry point + + P4 compacts pixel data further than P8 by restricting values to a 16-color + palette and packing 2 pixels in each byte. This severely restricts our + ability to use sub-images because odd positions land within bytes. + + Fortunately, we can solve this by using more edge pixels. The simplest way + to write a P4 loop is to process 2 pixels from a 2-aligned source image + position in a single iteration. Other structures don't even come close in + terms of CPU performance (which, as a reminder, is the main bottleneck in + Azur but not in gint): selecting nibbles individually is too long, while not + unrolling is still clearly inefficient. So it becomes very important to + forcibly align the sub-image on byte-aligned input boundaries and stick to + that grid. + + Obviously, this approach causes up to one extra pixel to be overwritten on + each side of every line. We solve this problem by adding *another* edge + pixel on the left side. In the renderer this is called the left edge or + edge_1, while the standard one is called right edge or edge_2. 
+ + r0: - (initially: cmd.effect) + r1: Number of lines remaining to draw + r2: Number of columns per line + r3: Input pointer + r4: Input stride + r5: Output pointer + r6: Output stride + r7: Right edge pointer + r8: - (initially: cmd) + r9: - (initially: cmd.loop) + r10: Left edge pointer */ + +_gint_image_p4_loop: + /* r4: int output_width (pixels) + r5: struct gint_image_cmd *cmd */ + + mov.b @(1,r5), r0 /* cmd.effect */ + add #2, r5 + + mov.w @r5+, r2 /* cmd.columns */ + mov r4, r6 + + mov.l r8, @-r15 + mov r5, r8 + + /* From here on the command is r8 */ + + mov.l r9, @-r15 + sub r2, r6 + + mov.w @r8+, r4 /* cmd.input_stride */ + add r6, r6 + + mov.b @r8+, r1 /* cmd.lines */ + shlr r4 + + mov.l r10, @-r15 + extu.b r1, r1 + + mov.b @r8+, r10 /* cmd.edge_1 */ + nop + + mov #0, r9 + addc r9, r4 /* r4 = (img.width + 1) >> 1 */ + + mov.l @r8+, r9 + shlr r0 /* T bit is now VFLIP */ + + mov.l @r8+, r5 /* cmd.output */ + nop + + bf.s _NO_VFLIP + mov.l @r8+, r3 /* cmd.input */ + +_VFLIP: + neg r4, r4 + nop + +_NO_VFLIP: + mov r2, r7 + shlr r7 + + jmp @r9 + subc r7, r4 diff --git a/src/render-cg/image/image_p4.c b/src/render-cg/image/image_p4.c new file mode 100644 index 0000000..b81f31a --- /dev/null +++ b/src/render-cg/image/image_p4.c @@ -0,0 +1,42 @@ +#include +#include + +void dimage_p4(int x, int y, image_t const *img, int eff) +{ + dsubimage_p4(x, y, img, 0, 0, img->width, img->height, eff); +} + +void dsubimage_p4(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_P4_RGB565A) + return dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.loop = gint_image_p4_normal; + gint_image_p4_loop(DWIDTH, &cmd); +} + +void dimage_p4_clearbg(int x, int y, image_t const *img, int eff, int bg) +{ + dsubimage_p4_clearbg(x, y,
img, 0, 0, img->width, img->height, eff,bg); +} + +void dsubimage_p4_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = gint_image_p4_clearbg; + gint_image_p4_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_p4_clearbg.S b/src/render-cg/image/image_p4_clearbg.S new file mode 100644 index 0000000..7fd1f54 --- /dev/null +++ b/src/render-cg/image/image_p4_clearbg.S @@ -0,0 +1,153 @@ +.global _gint_image_p4_clearbg +#include "image_macros.S" + +/* P4 CLEARBG, RAM version: by NULL canceling. + + This function is similar to P8 CLEARBG. Transparent pixels are not limited + by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an + explanation of NULL canceling. + + r0: [temporary] + r7: Right edge pointer + r8: Alpha value + r9: Palette + r10: Left edge pointer + r11: Nullable output pointer + r12: 0 (in outer loop: edge stride) + r13: [temporary] + r14: [temporary] + + Spilled to stack: + @(-12,r15): Right edge value + @(-8,r15): Left edge value + @(-4,r15): Edge stride */ + +.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + shlr r2 + nop + + add r10, r10 + nop + + mov.l @r8+, r9 /* cmd.palette */ + mov r2, r0 + + mov.w @r8+, r7 /* cmd.edge_2 */ + shll2 r0 + + mov.l r12, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add r5, r10 + + mov.l r14, @-r15 + add #-4, r5 + + mov.w @r8, r8 /* cmd.color_1 */ + add #-1, r4 /* Input stride compensation for pipelining */ + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + shll r8 /* alpha*2 compares against palette offsets */ + nop + + START + + mov.b @r3+, \TMP1 + nop + + mov.w @r7, r0 /* Save right edge */ + nop + + mov.l 
r0, @-r15 + shll \TMP1 + + mov.w @r10, r0 /* Save left edge */ + nop + + mov.l r0, @-r15 + nop + + mov.l r12, @-r15 + mov #0, r12 + +2: mov \TMP1, r0 + and #0x1e, r0 + + cmp/eq r0, r8 + mov #-1, r11 + + addc r12, r11 + mov #-4, \TMP2 + + and r5, r11 + mov.w @(r0,r9), r0 + + shld \TMP2, \TMP1 + mov #0x1e, \TMP2 + + and \TMP2, \TMP1 + mov.w r0, @(\OFF1,r11) + + cmp/eq \TMP1, r8 + mov #-1, r11 + + addc r12, r11 + mov \TMP1, r0 + + and r5, r11 + mov.b @r3+, \TMP1 + + add #\OUT_DIR, r5 + mov.w @(r0,r9), r0 + + mov.w r0, @(\OFF2,r11) +3: shll \TMP1 + + mov.l @r15+, r12 + nop + + mov.l @r15+, r0 + nop + + mov.w r0, @r10 /* Restore left edge */ + add r12, r10 + + mov.l @r15+, r0 + nop + + mov.w r0, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r12 + mov.l @r15+, r10 + EPILOGUE +.endm + +_gint_image_p4_clearbg: + tst #1, r0 + bf 9f + + GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4 +9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2 diff --git a/src/render-cg/image/image_p4_dye.S b/src/render-cg/image/image_p4_dye.S new file mode 100644 index 0000000..d1cc9d6 --- /dev/null +++ b/src/render-cg/image/image_p4_dye.S @@ -0,0 +1,147 @@ +.global _gint_image_p4_dye +#include "image_macros.S" + +/* P4 DYE, RAM version: by NULL canceling. + + Like with P8, this effect removes most of the complexity because there is no + longer any need to index the palette. However the decoding still takes a lot + of EX work so the performance is not as good. Since there are transparent + areas, Azur's CPU-bound version is at least to some extent faster than + bopti, so that's what we're using. + + See P8 CLEARBG for an explanation of NULL canceling. 
+ + r0: Dye value + r7: Right edge pointer + r8: Alpha value + r9: 0 (to neutralize addc during NULL-cancelling) + r10: Left edge pointer + r11: Nullable output pointer + r12: Edge stride + r13: [temporary] + r14: [temporary] + + Spilled to stack: + @(-8,r15): Right edge value + @(-4,r15): Left edge value */ + +.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + shlr r2 + nop + + add r10, r10 + nop + + mov.l @r8+, r0 /* cmd.palette (don't care) */ + mov r2, r0 + + mov.w @r8+, r7 /* cmd.edge_2 */ + shll2 r0 + + mov.l r12, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add r5, r10 + + mov.l r14, @-r15 + add #-4, r5 + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */ + add #-1, r4 /* Input stride compensation for pipelining */ + + mov.w @r8, r8 /* cmd.color_1 (alpha value) */ + nop + + START + + mov.b @r3+, \TMP1 + nop + + mov.w @r7, \TMP2 /* Save right edge */ + nop + + mov.l \TMP2, @-r15 + mov #0x0f, \TMP2 + + mov.w @r10, r9 /* Save left edge */ + and \TMP1, \TMP2 + + mov.l r9, @-r15 + mov #0, r9 + +2: cmp/eq \TMP2, r8 + mov #-1, r11 + + addc r9, r11 + mov #-4, \TMP2 + + and r5, r11 + nop + + shld \TMP2, \TMP1 + mov #0x0f, \TMP2 + + and \TMP2, \TMP1 + mov.w r0, @(\OFF1,r11) + + cmp/eq \TMP1, r8 + mov #-1, r11 + + addc r9, r11 + mov.b @r3+, \TMP1 + + and r5, r11 + nop + + mov #0x0f, \TMP2 + and \TMP1, \TMP2 + + add #\OUT_DIR, r5 +3: mov.w r0, @(\OFF2,r11) + + mov.l @r15+, \TMP2 + nop + + mov.w \TMP2, @r10 /* Restore left edge */ + add r12, r10 + + mov.l @r15+, \TMP2 + nop + + mov.w \TMP2, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r12 + mov.l @r15+, r10 + EPILOGUE +.endm + +_gint_image_p4_dye: + tst #1, r0 + bf 9f + + GEN_DYE_LOOP 0, 4, r13, r14, 6, 4 +9: GEN_DYE_LOOP 1, -4, r13, r14, 0, 2 diff --git a/src/render-cg/image/image_p4_dye.c 
b/src/render-cg/image/image_p4_dye.c new file mode 100644 index 0000000..81eaa52 --- /dev/null +++ b/src/render-cg/image/image_p4_dye.c @@ -0,0 +1,23 @@ +#include +#include + +void dimage_p4_dye(int x, int y, image_t const *img, int eff, int dye_color) +{ + dsubimage_p4_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void dsubimage_p4_dye(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 12; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = gint_image_p4_dye; + gint_image_p4_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_p4_effect.c b/src/render-cg/image/image_p4_effect.c new file mode 100644 index 0000000..1af0497 --- /dev/null +++ b/src/render-cg/image/image_p4_effect.c @@ -0,0 +1,32 @@ +#include + +void dsubimage_p4_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, ...) 
+{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int from = va_arg(args, int); + int to = va_arg(args, int); + dsubimage_p4_swapcolor(x, y, img, left, top, w, h, eff, from, + to); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + dsubimage_p4_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + dsubimage_p4_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + dsubimage_p4(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/src/render-cg/image/image_p4_normal.S b/src/render-cg/image/image_p4_normal.S new file mode 100644 index 0000000..15652ab --- /dev/null +++ b/src/render-cg/image/image_p4_normal.S @@ -0,0 +1,125 @@ +.global _gint_image_p4_normal +#include "image_macros.S" + +/* P4 Opaque rendering, VRAM version: by unrolling without edge pixels. + + This is the most unique function in the renderer, Azur included. A P4 image + cannot reasonably be decoded on a per-pixel basis because extracting half- + bytes is too slow. But using edge pixels results in extra write surface that + makes us slower than bopti in gint 2.7. + + This loop is thus the only one to implement 2-unrolling (no pipeline) while + manually avoiding the writes that a pair of edge pixels usually fix. Subtle + adjustments to strides are involved, making this function one of the most + tricky. + + A slight change is made to the command for the purpose of this function; + cmd.edge_1 (which is r10) is set to indicate whether the [left] side of the + box is even (r10=0) or odd (r10=1). This allows us to enter the loop at the + correct position. 
+ + r0: [temporary] + r7: [temporary] + r8: Column counter + r9: Palette + r10: box->left & 1 + r11: [temporary] */ + +.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.l @r8+, r9 /* cmd.palette */ + add #-4, r5 /* Better positioning for @(OFF[12], r5) */ + + /* The following arithmetic is to decrease r4 if the width is even + (r2 & 1) and left is odd (r10 = 1), since that means both the first + and last pixel load a full byte but use only half */ + + mov r2, r0 + xor #1, r0 + + mov.w @r8+, r7 /* cmd.edge_2 (don't care) */ + and r10, r0 + + mov.l r11, @-r15 + sub r0, r4 + + .if \HFLIP + mov r2, r0 + shll r0 + + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + +1: mov r2, r8 + tst r10, r10 /* Check whether to do an extra half iter. */ + + bt 2f + nop + + /* Additional half-iteration if box->left = 1 */ + + mov.b @r3+, r0 + shll r0 + and #0x1e, r0 + mov.w @(r0, r9), r0 + dt r8 + mov.w r0, @(\OFF1, r5) + bt.s 3f + add #\OUT_DIR, r5 + + /* The main loop needs to load pixels in output order. 
This is not + ideal for CPU usage, but we have some margins */ + +2: mov.b @r3+, \TMP1 + mov #-4, \TMP2 + + /* Stall */ + + shll \TMP1 + mov \TMP1, r0 + + shld \TMP2, r0 + nop + + and #0x1e, r0 + mov #0x1e, \TMP2 + + /* Stall */ + + mov.w @(r0,r9), r0 + and \TMP2, \TMP1 + + dt r8 + mov.w r0, @(\OFF1,r5) + + bt.s 3f + add #\OUT_DIR, r5 + + mov \TMP1, r0 + add #\OUT_DIR, r5 + + dt r8 + mov.w @(r0,r9), r0 + + bf.s 2b + mov.w r0, @(\OFF2,r5) + +3: END + + mov.l @r15+, r11 + mov.l @r15+, r10 + EPILOGUE +.endm + +_gint_image_p4_normal: + tst #1, r0 + bf 9f + + GEN_NORMAL_LOOP 0, 2, r7, r11, 4, 2 +9: GEN_NORMAL_LOOP 1, -2, r7, r11, 2, 4 diff --git a/src/render-cg/image/image_p4_swapcolor.S b/src/render-cg/image/image_p4_swapcolor.S new file mode 100644 index 0000000..3d35d34 --- /dev/null +++ b/src/render-cg/image/image_p4_swapcolor.S @@ -0,0 +1,175 @@ +.global _gint_image_p4_swapcolor +#include "image_macros.S" + +/* P4 SWAPCOLOR, RAM version: by branchless xor selection. + + I'm not sure whether this is the most optimized version for RAM. But it's + about 7-8% slower than bopti, and the effort of writing yet another + variation of P4's arduous loops doesn't seem worth it for a rare dynamic + effect. This is Azur's version. + + See P8 SWAPCOLOR for an explanation of branchless xor selection. + + r0: [temporary] + r7: Right edge pointer + r8: palette[cmd.color_1] ^ cmd.color_2 (ie. 
x ^ y) + r9: Palette + r10: Left edge pointer + r11: Holds (x ^ y) & -(c == x) during selection + r12: cmd.color_1 + r13: [temporary] + r14: [temporary] (in outer loop: edge stride) + + Spilled to stack: + @(-12,r15): Right edge value + @(-8,r15): Left edge value + @(-4,r15): Edge stride */ + +.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + shlr r2 + nop + + add r10, r10 + nop + + mov.l @r8+, r9 /* cmd.palette */ + mov r2, r0 + + mov.w @r8+, r7 /* cmd.edge_2 */ + shll2 r0 + + mov.l r12, @-r15 + shll r7 + + mov.l r13, @-r15 + add r5, r7 + + mov.w @r8+, r13 /* cmd.color_1 */ + add r5, r10 + + mov.l r11, @-r15 + add #-4, r5 + + mov r13, r12 + shll r13 + + mov.l r14, @-r15 + add r9, r13 + + mov.w @r8, r8 /* cmd.color_2 */ + add #-1, r4 /* Input stride compensation for pipelining */ + + mov.w @r13, r13 + mov r0, r14 + + add r6, r14 + nop + + xor r13, r8 + nop + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + shll r12 /* Compare color_1 * 2 with shifted values */ + nop + + START + + mov.b @r3+, \TMP1 + nop + + mov.w @r7, r0 /* Save right edge */ + nop + + mov.l r0, @-r15 + shll \TMP1 + + mov.w @r10, r0 /* Save left edge */ + nop + + mov.l r0, @-r15 + nop + + mov.l r14, @-r15 + nop + +2: mov \TMP1, r0 + and #0x1e, r0 + + cmp/eq r0, r12 + mov #-4, \TMP2 + + subc r11, r11 + nop + + mov.w @(r0,r9), r0 + and r8, r11 + + shld \TMP2, \TMP1 + mov #0x1e, \TMP2 + + xor r11, r0 + mov.w r0, @(\OFF1,r5) + + and \TMP2, \TMP1 + nop + + cmp/eq \TMP1, r12 + nop + + subc r11, r11 + mov \TMP1, r0 + + add #\OUT_DIR, r5 + mov.b @r3+, \TMP1 + + and r8, r11 + mov.w @(r0,r9), r0 + + shll \TMP1 + nop + + xor r11, r0 +3: mov.w r0, @(\OFF2,r5) + + + mov.l @r15+, r14 + nop + + mov.l @r15+, r0 + nop + + mov.w r0, @r10 /* Restore left edge */ + add r14, r10 + + mov.l @r15+, r0 + nop + + mov.w r0, @r7 /* Restore right edge */ + add r14, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r11 + mov.l @r15+, r13 + mov.l @r15+, r12 + mov.l @r15+, r10 + 
EPILOGUE +.endm + +_gint_image_p4_swapcolor: + tst #1, r0 + bf 9f + + GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 6, 0 +9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 0, 6 diff --git a/src/render-cg/image/image_p4_swapcolor.c b/src/render-cg/image/image_p4_swapcolor.c new file mode 100644 index 0000000..d3630bf --- /dev/null +++ b/src/render-cg/image/image_p4_swapcolor.c @@ -0,0 +1,46 @@ +#include +#include + +void dimage_p4_swapcolor(int x, int y, image_t const *img, int eff, + int old_color, int new_color) +{ + dsubimage_p4_swapcolor(x, y, img, 0, 0, img->width, img->height, + eff, old_color, new_color); +} + +void dsubimage_p4_swapcolor(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int old_index, int new_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = old_index; + cmd.color_2 = new_color; + cmd.loop = gint_image_p4_swapcolor; + gint_image_p4_loop(DWIDTH, &cmd); +} + +void dimage_p4_addbg(int x, int y, image_t const *img, int eff, + int bg_color) +{ + dsubimage_p4_addbg(x, y, img, 0, 0, img->width, img->height, + eff, bg_color); +} + +void dsubimage_p4_addbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = img->alpha; + cmd.color_2 = bg_color; + cmd.loop = gint_image_p4_swapcolor; + gint_image_p4_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_p8.S b/src/render-cg/image/image_p8.S new file mode 100644 index 0000000..7ea1af3 --- /dev/null +++ b/src/render-cg/image/image_p8.S @@ -0,0 +1,103 @@ +.global _gint_image_p8_loop + +/* gint's image renderer: 8-bit indexed entry point + + P8 compacts images by indexing each pixel 
on a 256-color palette, thus + halving the amount of data per pixel. This comes at the cost of an + additional lookup during rendering. For this format, there is no way to + bundle pixels together, and the more advanced loops handle pixels + individually with a 2-unrolled 2-stage-pipeline structure to accelerate the + CPU processing when that is the bottleneck (which often means where there + are transparent pixels to skip). + + For readers not familiar with loop optimization literature, the main idea is + that a simple loop which loads a pixel, processes it, and writes it, is too + inefficient because of RAW delays. To use the full speed of the CPU, one + needs to do more work in parallel and spread out actions on a single pixel, + which we do here with two loop transforms: + + * _Pipelining_ the loop consists in handling a single pixel over several + iterations by doing a little bit of work in each iteration. The data for + the pixel would move from register to register at each iteration, with the + loop code doing one stage's worth of computation on each register. This + gives us more pixels to work on simultaneously, and more independent work + means fewer RAW limitations. Loops in this renderer have 2 stages at most. + + * _Unrolling_ iterations of the loop consists in loading two (or more) pixels + at the start of each iteration so that we can work on one while waiting + for stalls and dependencies on the other. Unlike pipelining, pixels are + still confined within iterations. Non-trivial loops in this renderer + process 2 pixels per iteration. + + Unrolling has one major flaw: handling pairs of pixels only works if the + total amount of pixels to draw is even. The usual way to handle this for n + pixels is to do ⌊n/2⌋ iterations and handle the last pixel individually if n + is odd.
This is extremely annoying, since every row must check the value of + n, and an extra copy of the loop code for a single pixel must be maintained + on the side, which takes more space and more effort. + + However, we have a specialized solution here with *edge pixels*. The idea of + edge pixels is to round the number of pixels *up* and perform ⌊(n+1)/2⌋ runs + of the inner loop. If n is odd, this will overwrite a single pixel at the + end of the line. We can cancel this error after-the-fact by saving the value + of the (n+1)-th pixel of the line before the loop, and restoring it + afterwards. Note that if n is even then the save/restore is a no-op. + + This takes some caution however, as the temporary overwrite could be seen by + an interrupt. Some measures are put in place to reserve a couple of bytes on + each side of gint's VRAM and Azur's target fragment to avoid any problems. + + r0: - (initially: cmd.effect) + r1: Number of lines remaining to draw + r2: Number of columns per line + r3: Input pointer + r4: Input stride + r5: Output pointer + r6: Output stride + r7: Right edge or [temporary] + r8: - (initially: cmd) + r9: - (initially: cmd.loop) */ + +_gint_image_p8_loop: + /* r4: int output_width (pixels) + r5: struct gint_image_cmd *cmd */ + + mov.b @(1,r5), r0 /* cmd.effect */ + add #2, r5 + + mov.l r8, @-r15 + mov r4, r6 + + mov.w @r5+, r2 /* cmd.columns */ + mov r5, r8 + + /* From here on the command is r8 */ + + mov.l r9, @-r15 + shlr r0 /* T bit is now VFLIP */ + + mov.w @r8+, r4 /* cmd.input_stride */ + sub r2, r6 + + mov.b @r8+, r1 /* cmd.lines */ + add r6, r6 + + mov.b @r8+, r9 /* cmd.edge_1 - don't care */ + nop + + mov.l @r8+, r9 + extu.b r1, r1 + + mov.l @r8+, r5 /* cmd.output */ + nop + + bf.s _NO_VFLIP + mov.l @r8+, r3 /* cmd.input */ + +_VFLIP: + neg r4, r4 + nop + +_NO_VFLIP: + jmp @r9 + sub r2, r4 diff --git a/src/render-cg/image/image_p8.c b/src/render-cg/image/image_p8.c new file mode 100644 index 0000000..ec734b4 --- /dev/null +++
b/src/render-cg/image/image_p8.c @@ -0,0 +1,42 @@ +#include +#include + +void dimage_p8(int x, int y, image_t const *img, int eff) +{ + dsubimage_p8(x, y, img, 0, 0, img->width, img->height, eff); +} + +void dsubimage_p8(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_P8_RGB565A) + return dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.loop = gint_image_p8_normal; + gint_image_p8_loop(DWIDTH, &cmd); +} + +void dimage_p8_clearbg(int x, int y, image_t const *img, int eff, int bg) +{ + dsubimage_p8_clearbg(x, y, img, 0, 0, img->width, img->height, eff,bg); +} + +void dsubimage_p8_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = gint_image_p8_clearbg; + gint_image_p8_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_p8_clearbg.S b/src/render-cg/image/image_p8_clearbg.S new file mode 100644 index 0000000..da661f0 --- /dev/null +++ b/src/render-cg/image/image_p8_clearbg.S @@ -0,0 +1,147 @@ +.global _gint_image_p8_clearbg +#include "image_macros.S" + +/* P8 CLEARBG, RAM version: by NULL canceling. + + This function is one of the few that can still be bottlenecked by CPU in the + RAM model. This is because transparent pixels can be skipped over as fast as + the CPU allows without worrying about the writing speed of the RAM. + + For some reason that I have yet to uncover, branches are way slower than the + SH4AL-DSP manual suggests, and even slower while inside of DSP loops. 
This + completely favors branchless methods, and the one used here is one I call + "NULL canceling". + + The idea is that a write can be turned into a no-op by either writing the + value that is already in memory, or by writing somewhere else. The first + option is pretty slow, especially because it requires a selection operation + (rn = condition ? rn : rm) which is like the most general branchless trick. + + NULL canceling abuses the fact that NULL is mapped read-only on the platform + to turn the target pointer into NULL with the following identity: + + target & -(condition) = (condition ? target : NULL) + + The term -(condition) is materialized with an [addc #-1, #0] instruction + after the test, then the result is applied onto the target pointer with + [and], completing the trick in only 2 EX instructions. It does take more + registers, and prevents from using pre-decrement on the target. + + r0: [temporary] + r7: Right edge pointer + r8: Alpha value + r9: Palette + r10: Nullable output pointer + r11: 0 (to neutralize addc during NULL-cancelling) + r12: Right edge stride + r13: [temporary] + r14: [temporary] + + Spilled to stack: + @(-4,r15): Right edge value */ + +.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.l @r8+, r9 /* cmd.palette */ + shlr r2 + + mov.w @r8+, r7 /* cmd.edge_2 */ + mov r2, r0 + + mov.l r12, @-r15 + shll2 r0 + + mov.l r10, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add #-4, r5 + + mov.l r14, @-r15 + add #-2, r4 /* Input stride compensation for pipelining */ + + mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */ + mov #0, r11 + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + START + + mov.b @r3+, \TMP2 + nop + + mov.w @r7, r0 /* Save right edge */ + nop + + mov.l r0, @-r15 + cmp/eq \TMP2, r8 + + mov.b @r3+, \TMP1 + add \TMP2, \TMP2 + +2: mov #-1, r10 + addc r11, r10 /* r10 is now the mask */ + + and r5, r10 + mov \TMP2,
r0 + + cmp/eq \TMP1, r8 + mov.w @(r0, r9), r0 + + mov.w r0, @(\OFF1, r10) + add #\OUT_DIR, r5 + + mov.b @r3+, \TMP2 + nop + + mov #-1, r10 + addc r11, r10 + + add \TMP1, \TMP1 + mov \TMP1, r0 + + mov.b @r3+, \TMP1 + and r5, r10 + + mov.w @(r0, r9), r0 + cmp/eq \TMP2, r8 + + mov.w r0, @(\OFF2, r10) +3: add \TMP2, \TMP2 + + mov.l @r15+, r0 + nop + + mov.w r0, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r10 + mov.l @r15+, r12 + EPILOGUE +.endm + +_gint_image_p8_clearbg: + tst #1, r0 + bf 9f + + GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2 +9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4 diff --git a/src/render-cg/image/image_p8_dye.S b/src/render-cg/image/image_p8_dye.S new file mode 100644 index 0000000..6a8130e --- /dev/null +++ b/src/render-cg/image/image_p8_dye.S @@ -0,0 +1,115 @@ +.global _gint_image_p8_dye +#include "image_macros.S" + +/* P8 DYE, RAM version: by NULL canceling. + + This effect basically removes all the complexity out of P8 because we no + longer need to index the palette. We only keep the tight loop so that the + CPU can speed in areas with many transparent pixels. This gives some + acceleration over bopti. + + See P8 CLEARBG for an explanation of NULL canceling. 
+ + r0: Dye value + r7: Right edge pointer + r8: Alpha value + r9: Right edge value + r10: Nullable output pointer + r11: 0 (to neutralize addc during NULL-cancelling) + r12: Right edge stride + r13: [temporary] + r14: [temporary] */ + +.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.l @r8+, r9 /* cmd.palette (don't care) */ + shlr r2 + + mov.w @r8+, r7 /* cmd.edge_2 */ + mov r2, r0 + + mov.l r12, @-r15 + shll2 r0 + + mov.l r10, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add #-4, r5 + + mov.l r14, @-r15 + add #-2, r4 /* Input stride compensation for pipelining */ + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */ + nop + + mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */ + mov #0, r11 + + START + + mov.b @r3+, \TMP2 + nop + + mov.w @r7, r9 /* Save right edge */ + nop + + mov.b @r3+, \TMP1 + cmp/eq \TMP2, r8 + +2: mov #-1, r10 + addc r11, r10 /* r10 is now the mask */ + + and r5, r10 + nop + + mov.b @r3+, \TMP2 + cmp/eq \TMP1, r8 + + mov.w r0, @(\OFF1, r10) + add #\OUT_DIR, r5 + + mov #-1, r10 + addc r11, r10 + + mov.b @r3+, \TMP1 + and r5, r10 + + cmp/eq \TMP2, r8 +3: mov.w r0, @(\OFF2, r10) + + mov.w r9, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r10 + mov.l @r15+, r12 + EPILOGUE +.endm + +_gint_image_p8_dye: + tst #1, r0 + bf 9f + + GEN_DYE_LOOP 0, 4, r13, r14, 4, 2 +9: GEN_DYE_LOOP 1, -4, r13, r14, 2, 4 diff --git a/src/render-cg/image/image_p8_dye.c b/src/render-cg/image/image_p8_dye.c new file mode 100644 index 0000000..aa0e4b7 --- /dev/null +++ b/src/render-cg/image/image_p8_dye.c @@ -0,0 +1,23 @@ +#include +#include + +void dimage_p8_dye(int x, int y, image_t const *img, int eff, int dye_color) +{ + dsubimage_p8_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void dsubimage_p8_dye(int x, 
int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 12; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = gint_image_p8_dye; + gint_image_p8_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_p8_effect.c b/src/render-cg/image/image_p8_effect.c new file mode 100644 index 0000000..00b301a --- /dev/null +++ b/src/render-cg/image/image_p8_effect.c @@ -0,0 +1,32 @@ +#include + +void dsubimage_p8_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, ...) +{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int from = va_arg(args, int); + int to = va_arg(args, int); + dsubimage_p8_swapcolor(x, y, img, left, top, w, h, eff, from, + to); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + dsubimage_p8_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + dsubimage_p8_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + dsubimage_p8(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/src/render-cg/image/image_p8_normal.S b/src/render-cg/image/image_p8_normal.S new file mode 100644 index 0000000..7cc5c53 --- /dev/null +++ b/src/render-cg/image/image_p8_normal.S @@ -0,0 +1,42 @@ +.global _gint_image_p8_normal +#include "image_macros.S" + +/* P8 Opaque rendering, RAM version: trivial. + + As usual with RAM it is fairly easy to bottleneck writing speed, and so + there is no need for complex methods. Building longwords could be an option, + but it would require output alignment with edges, which is painful. 
*/ + +.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR + mov.l @r8+, r9 /* cmd.palette */ + + .if \HFLIP + add #-2, r5 + mov r2, r0 + shll r0 + add r0, r5 + shll r0 + add r0, r6 + .endif + +1: mov r2, r8 + +2: mov.b @r3+, r0 + shll r0 + mov.w @(r0, r9), r0 + mov.w r0, @r5 + +3: dt r8 + bf.s 2b + add #\OUT_DIR, r5 + + END + EPILOGUE +.endm + +_gint_image_p8_normal: + tst #1, r0 + bf 9f + + GEN_NORMAL_LOOP 0, 2 +9: GEN_NORMAL_LOOP 1, -2 diff --git a/src/render-cg/image/image_p8_swapcolor.S b/src/render-cg/image/image_p8_swapcolor.S new file mode 100644 index 0000000..2166d5b --- /dev/null +++ b/src/render-cg/image/image_p8_swapcolor.S @@ -0,0 +1,77 @@ +.global _gint_image_p8_swapcolor +#include "image_macros.S" + +/* P8 SWAPCOLOR, RAM version: by branchless xor selection. + + The core action of this loop is to render full pixels while replacing any + occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). Branching is + too slow as often, so instead we use the fact that both x and y are fixed to + use the identity + + c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c) + + We materialize -(c == x) by subtracting a register from itself with subc + after the comparison (which is delightfully elegant), while (x ^ y) is pre- + computed. This way, the selection is performed in one [subc], one [and] and + one [xor] for a total of 3 EX slots. This is slower than NULL-cancelling + (which only takes 2 EX slots) but still better than symmetric alternatives. + + Since we have a palette, we further trick by comparing against the index but + selecting against the palette entry, ie. we do + + palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c]) + + which allows the computation to occur in parallel with the palette access + and does not require the replacement value to be located at a valid index. + + r0: [temporary] + r7: cmd.color_1 + r8: palette[cmd.color_1] ^ cmd.color_2 (ie. 
x ^ y) + r9: Palette + r10: Holds (x ^ y) & -(c == x) during selection */ + +.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR + mov.l @r8+, r9 /* cmd.palette */ + mov.w @r8+, r0 /* cmd.edge_2 (don't care) */ + mov.w @r8+, r7 /* cmd.color_1 */ + mov.l r10, @-r15 + exts.b r7, r7 + mov r7, r0 + mov.w @r8, r8 /* cmd.color_2 */ + add r0, r0 + mov.w @(r0, r9), r0 + xor r0, r8 + + .if \HFLIP + add #-2, r5 + mov r2, r0 + shll r0 + add r0, r5 + shll r0 + add r0, r6 + .endif + + START + +2: mov.b @r3+, r0 + cmp/eq r0, r7 + add r0, r0 + subc r10, r10 + mov.w @(r0, r9), r0 + and r8, r10 + xor r10, r0 + mov.w r0, @r5 +3: add #\OUT_DIR, r5 + + END + + mov.l @r15+, r10 + EPILOGUE +.endm + +_gint_image_p8_swapcolor: + tst #1, r0 + bf 9f + + GEN_SWAPCOLOR_LOOP 0, 2 +9: GEN_SWAPCOLOR_LOOP 1, -2 diff --git a/src/render-cg/image/image_p8_swapcolor.c b/src/render-cg/image/image_p8_swapcolor.c new file mode 100644 index 0000000..55fe81f --- /dev/null +++ b/src/render-cg/image/image_p8_swapcolor.c @@ -0,0 +1,46 @@ +#include +#include + +void dimage_p8_swapcolor(int x, int y, image_t const *img, int eff, + int old_color, int new_color) +{ + dsubimage_p8_swapcolor(x, y, img, 0, 0, img->width, img->height, + eff, old_color, new_color); +} + +void dsubimage_p8_swapcolor(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int old_index, int new_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = old_index; + cmd.color_2 = new_color; + cmd.loop = gint_image_p8_swapcolor; + gint_image_p8_loop(DWIDTH, &cmd); +} + +void dimage_p8_addbg(int x, int y, image_t const *img, int eff, + int bg_color) +{ + dsubimage_p8_addbg(x, y, img, 0, 0, img->width, img->height, + eff, bg_color); +} + +void dsubimage_p8_addbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct 
gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = img->alpha; + cmd.color_2 = bg_color; + cmd.loop = gint_image_p8_swapcolor; + gint_image_p8_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_rgb16.S b/src/render-cg/image/image_rgb16.S new file mode 100644 index 0000000..4ee1b2f --- /dev/null +++ b/src/render-cg/image/image_rgb16.S @@ -0,0 +1,69 @@ +.global _gint_image_rgb16_loop + +/* gint's image renderer: 16-bit RGB entry point + + These formats are the simplest of the bunch. RGB565 can use longword access + in cases when alignment is favorable and no geometric effect is applied. In + other cases, pixels are handled individually; geometric effects affect the + input/output logic while color effects change the computations themselves. + + r0: - (initially: cmd.effect) + r1: Number of lines remaining to draw + r2: Number of columns per line + r3: Input pointer + r4: Input stride + r5: Output pointer + r6: Output stride + r7: Right edge (only used in Azur) or [temporary] + r8: - (initially: cmd) + r9: - (initially: cmd.loop) */ + +_gint_image_rgb16_loop: + /* r4: int output_width (pixels) + r5: struct gint_image_cmd *cmd */ + + mov.b @(1,r5), r0 /* cmd.effect */ + add #2, r5 + + mov.l r8, @-r15 + mov r4, r6 + + mov.w @r5+, r2 /* cmd.columns */ + mov r5, r8 + + /* From here on the command is r8 */ + + mov.l r9, @-r15 + shlr r0 /* T bit is now VFLIP */ + + mov.w @r8+, r4 /* cmd.input_stride */ + sub r2, r6 + + mov.b @r8+, r1 /* cmd.lines */ + add r6, r6 + + mov.b @r8+, r9 /* cmd.edge_1 (don't care) */ + nop + + mov.l @r8+, r9 + extu.b r1, r1 + + mov.l @r8+, r5 /* cmd.output */ + nop + + mov.l @r8+, r3 /* cmd.input */ + nop + + bf.s _NO_VFLIP + add #4, r8 /* cmd.palette (don't care) */ + +_VFLIP: + neg r4, r4 + nop + +_NO_VFLIP: + sub r2, r4 + nop + + jmp @r9 + add r4, r4 diff --git
a/src/render-cg/image/image_rgb16.c b/src/render-cg/image/image_rgb16.c new file mode 100644 index 0000000..b0dbf68 --- /dev/null +++ b/src/render-cg/image/image_rgb16.c @@ -0,0 +1,43 @@ +#include +#include + +void dimage_rgb16(int x, int y, image_t const *img, int eff) +{ + dsubimage_rgb16(x, y, img, 0, 0, img->width, img->height, eff); +} + +void dsubimage_rgb16(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_RGB565A) + return dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.loop = gint_image_rgb16_normal; + gint_image_rgb16_loop(DWIDTH, &cmd); +} + +void dimage_rgb16_clearbg(int x, int y, image_t const *img, int eff,int bg) +{ + dsubimage_rgb16_clearbg(x, y, img, 0, 0, img->width, img->height, eff, + bg); +} + +void dsubimage_rgb16_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = gint_image_rgb16_clearbg; + gint_image_rgb16_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_rgb16_clearbg_dye.S b/src/render-cg/image/image_rgb16_clearbg_dye.S new file mode 100644 index 0000000..904aa3d --- /dev/null +++ b/src/render-cg/image/image_rgb16_clearbg_dye.S @@ -0,0 +1,53 @@ +.global _gint_image_rgb16_clearbg +.global _gint_image_rgb16_dye +#include "image_macros.S" + +/* RGB16 CLEARBG and DYE, RAM version: trivial. + + This function handles both CLEARBG and DYE; in RGB16 they are the same, + except that DYE writes not the pixel value (TMP) but a fixed color (SRC). 
As + is often the case, the RAM speed is limiting, so there is no point in + improving speed of the code on the CPU side. */ + +.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP, SRC + mov.w @r8+, r0 /* cmd.edge_2 (don't care) */ + mov.w @r8+, r9 /* cmd.color_1 (alpha color) */ + mov.w @r8+, r0 /* cmd.color_2 (dye color) */ + + .if \HFLIP + add #-2, r5 + mov r2, r8 + shll r8 + add r8, r5 + shll r8 + add r8, r6 + .endif + +1: mov r2, r8 + +2: mov.w @r3+, \TMP + cmp/eq \TMP, r9 + bt 3f + mov.w \SRC, @r5 + +3: dt r8 + bf.s 2b + add #(\OUT_DIR/2), r5 + + END + EPILOGUE +.endm + +_gint_image_rgb16_clearbg: + tst #1, r0 + bf 9f + + GEN_CLEARBG_DYE_LOOP 0, 4, r0, r0 +9: GEN_CLEARBG_DYE_LOOP 1, -4, r0, r0 + +_gint_image_rgb16_dye: + tst #1, r0 + bf 9f + + GEN_CLEARBG_DYE_LOOP 0, 4, r7, r0 +9: GEN_CLEARBG_DYE_LOOP 1, -4, r7, r0 diff --git a/src/render-cg/image/image_rgb16_dye.c b/src/render-cg/image/image_rgb16_dye.c new file mode 100644 index 0000000..0757fe8 --- /dev/null +++ b/src/render-cg/image/image_rgb16_dye.c @@ -0,0 +1,23 @@ +#include +#include + +void dimage_rgb16_dye(int x, int y, image_t const *img, int eff, int dye_color) +{ + dsubimage_rgb16_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void dsubimage_rgb16_dye(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 12; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = gint_image_rgb16_dye; + gint_image_rgb16_loop(DWIDTH, &cmd); +} diff --git a/src/render-cg/image/image_rgb16_effect.c b/src/render-cg/image/image_rgb16_effect.c new file mode 100644 index 0000000..64c9b5c --- /dev/null +++ b/src/render-cg/image/image_rgb16_effect.c @@ -0,0 +1,32 @@ +#include + +void dsubimage_rgb16_effect(int x, int y, image_t const *img, + int left, int top,
int w, int h, int eff, ...) +{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int from = va_arg(args, int); + int to = va_arg(args, int); + dsubimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff, + from, to); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + dsubimage_rgb16_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + dsubimage_rgb16_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + dsubimage_rgb16(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/src/render-cg/image/image_rgb16_normal.S b/src/render-cg/image/image_rgb16_normal.S new file mode 100644 index 0000000..bc9d087 --- /dev/null +++ b/src/render-cg/image/image_rgb16_normal.S @@ -0,0 +1,201 @@ +.global _gint_image_rgb16_normal +#include "image_macros.S" + +/* RGB16 Opaque rendering, RAM version: by longword access. + + This function of the image renderer is designed for the RAM model only. At + default overclock levels, the RAM can register a write every 13-14 cycles, + regardless of size. Since this amount of time is more than enough to build a + target longword regardless of alignment and geometry considerations, the + main and only focus of this function is to only write longwords. + + Since longwords can only be written at 4-aligned addresses and always make + pairs of pixels, there are variations on the loop depending on the rendered + width and destination. These are marked with the following convention: + + * w1 / w2 denotes the parity of the command width; + * o2 / o4 denotes the alignment of the output. + + There is a forward and a backward variation for all four combinations of + these parameters, noted F_ and B_ in label names. 
Some word-based variations + are provided for width ≤ 8, which is just a way to ensure that the longword- + based loops always have at least one iteration, since they're implemented as + do/while. + + The loops themselves are nowhere near tight on the CPU side and entirely + bottlenecked by the RAM, hence the simplicity and complete disregard for + superscalar parallelism. */ + +_gint_image_rgb16_normal: + /* We use word copy for width ≤ 8; this is to ensure that there is at + least one longword in the non-trivial loop, simplifying checks */ + tst #1, r0 + mov #8, r0 + + bf.s .BACKWARD + cmp/ge r2, r0 + +.FORWARD: + bt _FORWARD_WORD_COPY + nop + + bra _FORWARD_LONG_COPY + nop + +.BACKWARD: + mov r2, r0 + add r0, r0 + add r0, r5 + add r0, r0 + + bt.s _BACKWARD_WORD_COPY + add r0, r6 + + bra _BACKWARD_LONG_COPY + nop + +_FORWARD_WORD_COPY: + START +2: movs.w @r3+, x0 +3: movs.w x0, @r5+ + END + EPILOGUE + +_BACKWARD_WORD_COPY: + START +2: movs.w @r3+, x0 +3: movs.w x0, @-r5 + END + EPILOGUE + +_FORWARD_LONG_COPY: + shlr r2 /* Test width parity */ + mov #2, r0 + + bt .F_w1 + nop + +.F_w2: tst r0, r5 /* Test alignment of output */ + bf .F_w2o2 + +.F_w2o4: + START +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r7 + xtrct r0, r7 + mov.l r7, @r5 +3: add #4, r5 + END + EPILOGUE + +.F_w2o2: + add #-1, r2 + START + mov.w @r3+, r0 + mov.w r0, @r5 + add #2, r5 +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r7 + xtrct r0, r7 + mov.l r7, @r5 +3: add #4, r5 + mov.w @r3+, r0 + mov.w r0, @r5 + add #2, r5 + END + EPILOGUE + +.F_w1: tst r0, r5 /* Test alignment of output */ + bf .F_w1o2 + +.F_w1o4: + START +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r7 + xtrct r0, r7 + mov.l r7, @r5 +3: add #4, r5 + mov.w @r3+, r0 + mov.w r0, @r5 + add #2, r5 + END + EPILOGUE + +.F_w1o2: + START + mov.w @r3+, r0 + mov.w r0, @r5 + add #2, r5 +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r7 + xtrct r0, r7 + mov.l r7, @r5 +3: add #4, r5 + END + EPILOGUE + +_BACKWARD_LONG_COPY: + shlr r2 /* Test width parity 
*/ + mov #2, r0 + + bt .B_w1 + nop + +.B_w2: tst r0, r5 /* Test alignment of output */ + bf .B_w2o2 + +.B_w2o4: + START +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r0 + xtrct r7, r0 +3: mov.l r0, @-r5 + END + EPILOGUE + +.B_w2o2: + add #-1, r2 + START + mov.w @r3+, r0 + mov.w r0, @-r5 +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r0 + xtrct r7, r0 +3: mov.l r0, @-r5 + mov.w @r3+, r0 + mov.w r0, @-r5 + END + EPILOGUE + +.B_w1: tst r0, r5 /* Test alignment of output */ + bf .B_w1o2 + +.B_w1o4: + START +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r0 + xtrct r7, r0 +3: mov.l r0, @-r5 + mov.w @r3+, r0 + mov.w r0, @-r5 + END + EPILOGUE + +.B_w1o2: + START + mov.w @r3+, r0 + mov.w r0, @-r5 +2: mov.w @r3+, r0 + mov.w @r3+, r7 + shll16 r0 + xtrct r7, r0 +3: mov.l r0, @-r5 + END + EPILOGUE diff --git a/src/render-cg/image/image_rgb16_swapcolor.S b/src/render-cg/image/image_rgb16_swapcolor.S new file mode 100644 index 0000000..924b9fb --- /dev/null +++ b/src/render-cg/image/image_rgb16_swapcolor.S @@ -0,0 +1,45 @@ +.global _gint_image_rgb16_swapcolor +#include "image_macros.S" + +/* RGB16 SWAPCOLOR, RAM version: trivial. + + This function is once again bottlenecked by RAM. Generating longwords would + be tight and require significant adjustments, so we stick to words, and the + trivial bopti-style version already maxes out the output rate. 
*/ + +.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR + mov.w @r8+, r0 /* cmd.edge_2 (don't care) */ + mov.w @r8+, r9 /* cmd.color_1 */ + mov.w @r8+, r7 /* cmd.color_2 */ + + .if \HFLIP + add #-2, r5 + mov r2, r0 + shll r0 + add r0, r5 + shll r0 + add r0, r6 + .endif + +1: mov r2, r8 + +2: mov.w @r3+, r0 + cmp/eq r0, r9 + bf 4f + mov r7, r0 +4: mov.w r0, @r5 + +3: dt r8 + bf.s 2b + add #\OUT_DIR, r5 + + END + EPILOGUE +.endm + +_gint_image_rgb16_swapcolor: + tst #1, r0 + bf 9f + + GEN_SWAPCOLOR_LOOP 0, 2 +9: GEN_SWAPCOLOR_LOOP 1, -2 diff --git a/src/render-cg/image/image_rgb16_swapcolor.c b/src/render-cg/image/image_rgb16_swapcolor.c new file mode 100644 index 0000000..2167ce4 --- /dev/null +++ b/src/render-cg/image/image_rgb16_swapcolor.c @@ -0,0 +1,46 @@ +#include +#include + +void dimage_rgb16_swapcolor(int x, int y, image_t const *img, int eff, + int old_color, int new_color) +{ + dsubimage_rgb16_swapcolor(x, y, img, 0, 0, img->width, img->height, + eff, old_color, new_color); +} + +void dsubimage_rgb16_swapcolor(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int old_color, int new_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = old_color; + cmd.color_2 = new_color; + cmd.loop = gint_image_rgb16_swapcolor; + gint_image_rgb16_loop(DWIDTH, &cmd); +} + +void dimage_rgb16_addbg(int x, int y, image_t const *img, int eff, + int bg_color) +{ + dsubimage_rgb16_addbg(x, y, img, 0, 0, img->width, img->height, + eff, bg_color); +} + +void dsubimage_rgb16_addbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH, + DHEIGHT)) return; + cmd.effect += 8; + cmd.color_1 = img->alpha; + 
cmd.color_2 = bg_color; + cmd.loop = gint_image_rgb16_swapcolor; + gint_image_rgb16_loop(DWIDTH, &cmd); +}