render-cg: add new image rendering functions with dynamic effects

2022-05-04 17:27:02 +01:00 · 2022-05-04 17:27:02 +01:00 · f219e5c882
parent 904ab74984
commit f219e5c882
31 changed files with 2506 additions and 47 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -169,6 +169,35 @@ set(SOURCES_CG
  src/render-cg/gint_dline.c
  src/render-cg/topti-asm.s
  src/render-cg/topti.c
+  # Fast image renderer
+  src/render-cg/image/image.c
+  src/render-cg/image/image_rgb16.S
+  src/render-cg/image/image_rgb16_normal.S
+  src/render-cg/image/image_rgb16_clearbg_dye.S
+  src/render-cg/image/image_rgb16_swapcolor.S
+  src/render-cg/image/image_p8.S
+  src/render-cg/image/image_p8_normal.S
+  src/render-cg/image/image_p8_clearbg.S
+  src/render-cg/image/image_p8_swapcolor.S
+  src/render-cg/image/image_p8_dye.S
+  src/render-cg/image/image_p4.S
+  src/render-cg/image/image_p4_normal.S
+  src/render-cg/image/image_p4_clearbg.S
+  src/render-cg/image/image_p4_swapcolor.S
+  src/render-cg/image/image_p4_dye.S
+  # Interface to the fast image renderer
+  src/render-cg/image/image_rgb16.c
+  src/render-cg/image/image_rgb16_effect.c
+  src/render-cg/image/image_rgb16_swapcolor.c
+  src/render-cg/image/image_rgb16_dye.c
+  src/render-cg/image/image_p8.c
+  src/render-cg/image/image_p8_effect.c
+  src/render-cg/image/image_p8_swapcolor.c
+  src/render-cg/image/image_p8_dye.c
+  src/render-cg/image/image_p4.c
+  src/render-cg/image/image_p4_effect.c
+  src/render-cg/image/image_p4_swapcolor.c
+  src/render-cg/image/image_p4_dye.c
 )

 set(ASSETS_FX src/font5x7.png)
--- a/include/gint/display-cg.h
+++ b/include/gint/display-cg.h
@ -1,11 +1,15 @@
 //---
-//	gint:display-cg - fxcg50 rendering functions
+// gint:display-cg - fx-CG 50 rendering functions
 //
-//	This module covers all 16-bit opaque rendering functions. For
-//	gamma-related functions, color composition, check out a color library.
+// This module covers rendering functions specific to the fx-CG 50. In addition
+// to triple-buffering management, this mainly includes image manipulation
+// tools as well as the very versatile dimage_effect() and dsubimage_effect()
+// functions that support high-performance image rendering with a number of
+// geometric and color effects.
 //
-//	All the functions in this module work on a 396x224 resolution - gint
-//	lets you use the full surface!
+// The fx-CG OS restricts the display to a 384x216 rectangle roughly around the
+// center, leaving margins on three sides. However, gint configures the display
+// to use the full 396x224 surface!
 //---

 #ifndef GINT_DISPLAY_CG
@ -18,6 +22,7 @@ extern "C" {
 #endif

 #include <gint/defs/types.h>
+#include <gint/image.h>

 /* Dimensions of the VRAM */
 #define DWIDTH 396
@ -57,49 +62,9 @@ enum {
   green is not used). */
 #define C_RGB(r,g,b) (((r) << 11) | ((g) << 6) | (b))

-//---
-//	Image rendering (bopti)
-//---
+/* See <gint/image.h> for the details on image manipulation. */
+typedef image_t bopti_image_t;

-/* bopti_image_t: Image files encoded for bopti
-   This format is created by the fxSDK's [fxconv] tool from standard images. */
-typedef struct
-{
-	/* Color profile (type of palette), could be extended into a bit field
-	   later on */
-	uint16_t profile;
-
-	/* Color code assigned to transparent pixels (unused in 16-bit). In
-	   P8_RGB565A, the value assigned to alpha is always 0. */
-	uint16_t alpha;
-
-	/* Full width and height, in pixels */
-	uint16_t width;
-	uint16_t height;
-
-	/* Here we lose structure because of the flexible array.
-
-	   RGB565, RGB565A:
-	     * Pixels in row-major order, 16 bits per pixel
-	   P8:
-	     * Palette with 256 entries (512 bytes total)
-	     * Pixels in row-major order, 8 bits per pixel
-	   P8_RGB565A, P8_RGB565:
-	     * Number of entries in palette, N (2 bytes)
-	     * Palette with N entries (2N bytes)
-	     * Pixels in row-major order, 8 bits per pixel (signed indices in
-	       an uint16_t array starting at <palette>+<256 bytes>)
-	   P4/P4_RGB565A, P4_RGB565:
-	     * Palette with 16 entries (32 bytes total)
-	     * Pixels in row-major order, 4 bits per pixel, each row
-	       byte-padded */
-	uint16_t data[];
-
-} GPACKED(4) bopti_image_t;
-
-/* Old alias to image_t, now deprecated because of libimg */
-typedef bopti_image_t image_t __attribute__((deprecated(
-	"image_t has been renamed to bopti_image_t")));

 //---
 //	Video RAM management
--- a/include/gint/image.h
+++ b/include/gint/image.h
@ -0,0 +1,365 @@
+//---
+// gint:image - Image manipulation and rendering
+//
+// Note: this module is currently only available on fx-CG.
+//
+// This header provides image manipulation functions. This mainly consists of a
+// reference-based image format, various access and modification functions, and
+// a number of high-performance transformations and rendering effects. If you
+// find yourself limited by rendering time, note that RAM writing speed is
+// often the bottleneck, and image rendering is much faster in Azur (which is
+// what the renderer was initially designed for).
+//
+// We support 3 bit depths: full-color 16-bit (RGB565), indexed 8-bit (P8) and
+// indexed 4-bit (P4). All three have an "alpha" variation where one color is
+// treated as transparent, leading to 6 total formats.
+//
+// The image renderers support so-called *dynamic effects*, which are image
+// transformations performed on-the-fly while rendering, without generating an
+// intermediate image. They comprise straightforward transformations that
+// achieve similar performance to straight rendering and can be combined to
+// some extent, which makes them reliable whenever applicable.
+//
+// TODO: Switch to libimg-style image refs.
+//---
+
+#ifndef GINT_IMAGE
+#define GINT_IMAGE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef FXCG50
+#error <gint/image.h> is only supported on FXCG50
+#else
+
+#include <gint/defs/attributes.h>
+#include <gint/defs/types.h>
+
+//---
+// Image structures
+//---
+
+/* Image formats. Note that transparency really only indicates the default
+   rendering method, as a transparent background can always be added or removed
+   by a dynamic effect on any image. */
+enum {
+	IMAGE_RGB565      = 0,  /* RGB565 without alpha */
+	IMAGE_RGB565A     = 1,  /* RGB565 with one transparent color */
+	IMAGE_P8_RGB565   = 4,  /* 8-bit palette, all opaque colors */
+	IMAGE_P8_RGB565A  = 5,  /* 8-bit with one transparent color */
+	IMAGE_P4_RGB565   = 6,  /* 4-bit palette, all opaque colors */
+	IMAGE_P4_RGB565A  = 3,  /* 4-bit with one transparent color */
+
+	/* NOTE(review): presumably the legacy "P8" layout with a fixed
+	   256-entry palette that is still listed in image_t's data
+	   description -- confirm before relying on it. */
+	IMAGE_DEPRECATED_P8 = 2,
+};
+
+/* image_t: gint's native bitmap image format
+   Images of this format can be created through this header's API but also by
+   using the fxSDK's built-in image converters with fxconv. */
+typedef struct
+{
+	/* Color format, one of the IMAGE_* values defined above. */
+	uint16_t profile;
+	/* For formats with alpha, value or index used for transparency. */
+	uint16_t alpha;
+	/* Full width and height, in pixels */
+	uint16_t width;
+	uint16_t height;
+
+	/* Here we lose structure because of the flexible array.
+
+	   RGB565, RGB565A:
+	     * Pixels in row-major order, 16 bits per pixel
+	   P8:
+	     * Palette with 256 entries (512 bytes total)
+	     * Pixels in row-major order, 8 bits per pixel
+	   P8_RGB565A, P8_RGB565:
+	     * Number of entries in palette, N (2 bytes)
+	     * Palette with N entries (2N bytes)
+	     * Pixels in row-major order, 8 bits per pixel (signed indices in
+	       an uint16_t array starting at <palette>+<256 bytes>)
+	   P4/P4_RGB565A, P4_RGB565:
+	     * Palette with 16 entries (32 bytes total)
+	     * Pixels in row-major order, 4 bits per pixel, each row
+	       byte-padded */
+	uint16_t data[];
+
+} GPACKED(4) image_t;
+
+/* Dynamic effects: these transformations can be applied on images while
+   rendering. Not all effects can be combined; unless specified otherwise:
+   - HFLIP and VFLIP can both be added regardless of any other effect
+   - At most one color effect can be applied */
+enum {
+	/* Value 0x01 is reserved, because it is DIMAGE_NOCLIP, which although
+	   part of the old API still needs to be supported. */
+
+	/* [Any]: Skip clipping the command against the source image */
+	IMAGE_NOCLIP_INPUT   = 0x04,
+	/* [Any]: Skip clipping the command against the output VRAM */
+	IMAGE_NOCLIP_OUTPUT  = 0x08,
+	/* [Any]: Skip clipping both */
+	IMAGE_NOCLIP         = IMAGE_NOCLIP_INPUT | IMAGE_NOCLIP_OUTPUT,
+
+	// Geometric effects. These values should remain at exactly bit 8 and
+	// following, or change gint_image_mkcmd() along with it.
+
+	/* [Any]: Flip image vertically */
+	IMAGE_VFLIP          = 0x0100,
+	/* [Any]: Flip image horizontally */
+	IMAGE_HFLIP          = 0x0200,
+
+	// Color effects
+
+	/* [RGB565, P8_RGB565, P4_RGB565]: Make a color transparent
+	   Adds one argument:
+	   * Color to clear (RGB16: 16-bit value; P8/P4: palette index) */
+	IMAGE_CLEARBG        = 0x10,
+	/* [RGB565, P8_RGB565, P4_RGB565]: Turn a color into another
+	   Adds two arguments:
+	   * Color to replace (RGB16: 16-bit value; P8/P4: palette index)
+	   * Replacement color (16-bit value) */
+	IMAGE_SWAPCOLOR      = 0x20,
+	/* [RGB565A, P8_RGB565A, P4_RGB565A]: Add a background
+	    Adds one argument:
+	    * Background color (16-bit value) */
+	IMAGE_ADDBG          = 0x40,
+	/* [RGB565A, P8_RGB565A, P4_RGB565A]: Dye all non-transparent pixels
+	   Adds one argument:
+	   * Dye color (16-bit value) */
+	IMAGE_DYE            = 0x80,
+};
+
+//---
+// Image access and information
+//---
+
+/* TODO: Expand */
+
+int image_get_pixel(image_t const *img, int x, int y);
+
+int image_decode_pixel(image_t const *img, int pixel);
+
+//---
+// Image rendering functions
+//
+// The following functions extend dimage() and dsubimage(). The [effects]
+// parameter takes a combination of IMAGE_* flags and effects, limited to the
+// combinations previously described, with additional arguments depending on
+// the color effect being applied.
+//
+//   dimage_effect(x, y, img, effects, ...)
+//   dsubimage_effect(x, y, img, left, top, w, h, effects, ...)
+//
+// However if you use these super-generic functions you will link the code for
+// all effects and all formats into your add-in, which takes a fair amount of
+// space. If that's a problem, you can use the more specific functions below:
+//
+// * dimage_<FORMAT>_<EFFECT>() for one particular format (rgb16, p8, p4) along
+//   with one particular color effect (clearbg, swapcolor, addbg, dye).
+// * dimage_<FORMAT>() is like the above when no color effect is applied.
+//
+// All of them support the HFLIP and VFLIP flags. For effect-specific functions
+// the corresponding effect flag can be omitted (e.g. IMAGE_CLEARBG is implicit
+// when using dimage_p8_clearbg()).
+//---
+
+/* dimage_effect(): Generalized dimage() supporting dynamic effects */
+#define dimage_effect(x, y, img, eff, ...) \
+	dsubimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
+		##__VA_ARGS__)
+/* dsubimage_effect(): Generalized dsubimage() supporting dynamic effects */
+void dsubimage_effect(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int effects, ...);
+
+/* Specific versions for each format */
+#define DIMAGE_SIG1(NAME, ...) \
+	void dimage_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
+	void dsubimage_ ## NAME(int x, int y, image_t const *img, \
+		int left, int top, int w, int h, ##__VA_ARGS__);
+#define DIMAGE_SIG(NAME, ...) \
+	DIMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
+	DIMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
+	DIMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
+
+/* d[sub]image_{rgb16,p8,p4}_effect(..., effects, <extra arguments>) */
+DIMAGE_SIG(_effect, int effects, ...)
+/* d[sub]image_{rgb16,p8,p4}(..., effects) (no color effect, like dimage()) */
+DIMAGE_SIG(, int effects)
+/* d[sub]image_{rgb16,p8,p4}_clearbg(..., effects, bg_color_or_index) */
+DIMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
+/* d[sub]image_{rgb16,p8,p4}_swapcolor(..., effects, source, replacement) */
+DIMAGE_SIG(_swapcolor, int effects, int source, int replacement)
+/* d[sub]image_{rgb16,p8,p4}_addbg(..., effects, bg_color) */
+DIMAGE_SIG(_addbg, int effects, int bg_color)
+/* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */
+DIMAGE_SIG(_dye, int effects, int dye_color)
+
+#define dimage_rgb16_effect(x, y, img, eff, ...) \
+	dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
+		eff, ##__VA_ARGS__)
+#define dimage_p8_effect(x, y, img, eff, ...) \
+	dsubimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
+		eff, ##__VA_ARGS__)
+#define dimage_p4_effect(x, y, img, eff, ...) \
+	dsubimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
+		eff, ##__VA_ARGS__)
+
+#undef DIMAGE_SIG
+#undef DIMAGE_SIG1
+
+//---
+// Clipping utilities
+//---
+
+/* Double box specifying both a source and target area */
+struct gint_image_box
+{
+	/* Target location of top-left corner, in output pixels. May be
+	   negative until the box is clipped against the output. */
+	int x, y;
+	/* Width and height of rendered sub-image; shared by the source and
+	   target areas. */
+	int w, h;
+	/* Top-left corner of the source area within the image. The source
+	   area spans [left, left+w) horizontally and [top, top+h) vertically
+	   (low included, high excluded). */
+	int left, top;
+};
+
+/* Clip the provided box against the input. If, after clipping, the box no
+   longer intersects the output (whose size is specified as out_w/out_h),
+   returns false. Otherwise, returns true. */
+bool gint_image_clip_input(image_t const *img, struct gint_image_box *box,
+    int out_w, int out_h);
+
+/* Clip the provided box against the output. */
+void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h);
+
+//---
+// Internal image rendering routines
+//
+// The following functions (or non-functions) are implemented in assembler and
+// make up the internal interface of the image renderer. If you just want to
+// display images, use dimage() and variations; these are only useful if you
+// have a different rendering system and wish to use image rendering with
+// dynamic effects in it.
+//---
+
+/* Renderer command. This structure includes most of the information used by
+   the image renderer to perform blits. Some of the information on the target
+   is also passed as direct arguments, which is more convenient and slightly
+   faster.
+
+   Most of the values here can be set with gint_image_mkcmd(). The last two
+   members, along with the return values of the gint_image_FORMAT_loop()
+   functions, are used to update the command if one needs to draw *parts* of
+   the image and resume the rendering later. This is used in Azur. */
+struct gint_image_cmd
+{
+	/* Shader ID. This is used in Azur, and ignored in gint */
+	uint8_t shader_id;
+	/* Dynamic effects
+	    Bit 0:     VFLIP
+	    Bit 1:     HFLIP
+	    Bits 2-7:  0=NONE, 1=CLEARBG, 2=SWAPCOLOR, 3=DYE
+	   gint_image_mkcmd() only sets bits 0-1; the effect-specific functions
+	   add the color effect number shifted left by 2. */
+	uint8_t effect;
+
+	/* Number of pixels to render per line. For formats that force either x
+	   or width alignment (most of them), this is already adjusted to a
+	   suitable multiple (usually a multiple of 2). */
+	int16_t columns;
+
+	/* Stride of the input image (number of pixels between each row), in
+	   pixels, without subtracting the number of columns */
+	int16_t input_stride;
+
+	/* Number of lines in the command. This can be adjusted freely, and is
+	   particularly useful in Azur for fragmented rendering. */
+	uint8_t lines;
+
+	/* [Any effect]: Offset of left edge, in pixels from the start of the
+	   output line. gint_image_mkcmd() defaults it to -1. For P4 commands
+	   built without a left edge, it holds (box->left & 1) instead. */
+	int8_t edge_1;
+
+	/* Core loop; this is an internal label of the renderer */
+	void const *loop;
+	/* Output pixel array, offset by target x/y */
+	void const *output;
+	/* Input pixel array, offset by source x/y. For formats that force x
+	   alignment, this is already adjusted. */
+	void const *input;
+	/* Palette, when applicable */
+	uint16_t const *palette;
+
+	/* [Any effect]: Offset of right edge, in pixels from the start of the
+	   output line. gint_image_mkcmd() defaults it to -1. */
+	int16_t edge_2;
+	/* [CLEARBG, SWAPCOLOR]: Source color
+	   [DYE]: Transparent color (dsubimage_p4_dye() stores img->alpha) */
+	uint16_t color_1;
+	/* [SWAPCOLOR]: Destination color
+	   [DYE]: Dye color */
+	uint16_t color_2;
+
+	/* Remaining height (for updates between fragments) */
+	int16_t height;
+	/* Local x position (for updates between fragments) */
+	int16_t x;
+};
+
+/* gint_image_mkcmd(): Prepare a rendering command with dynamic effects
+
+   This function crafts an image renderer command. It loads all the settings
+   except for effect-dependent parameters: the [.loop] label, the color section
+   of [.effect], and color effect settings. See the effect-specific functions
+   to see how they are defined.
+
+   The benefit of this approach is that the rendering code does not need to be
+   linked in unless an effect is actually used, which avoids blowing up the
+   size of the add-in as the number of supported dynamic effects increases.
+
+   @box         Requested on-screen box (will be clipped depending on effects)
+   @img         Source image
+   @effects     Set of dynamic effects to be applied, as an [IMAGE_*] bitmask
+   @left_edge   Whether to force 2-alignment on the input (box->left)
+   @right_edge  Whether to force 2-alignment on the width
+   @cmd         Command to be filled
+   @out_width   Output width (usually DWIDTH)
+   @out_height  Output height (usually DHEIGHT)
+
+   Returns false if there is nothing to render because of clipping (in which
+   case [cmd] is unchanged), true otherwise. [*box] is also updated to reflect
+   the final box after clipping but not accounting for edges.  */
+bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
+	int effects, bool left_edge, bool right_edge,
+	struct gint_image_cmd *cmd, int out_width, int out_height);
+
+/* Entry point of the renderers. These functions can be called normally as long
+   as you can build the commands (eg. by using gint_image_mkcmd() then filling
+   the effect-specific information). */
+void *gint_image_rgb16_loop  (int output_width, struct gint_image_cmd *cmd);
+void *gint_image_p8_loop     (int output_width, struct gint_image_cmd *cmd);
+void *gint_image_p4_loop     (int output_width, struct gint_image_cmd *cmd);
+
+/* Renderer fragments. The following can absolutely not be called from C code
+   as they aren't full functions (and this isn't their prototype). These are
+   continuations to be specified in the [.loop] field of a command before using
+   one of the functions above. */
+
+void gint_image_rgb16_normal(void);
+void gint_image_rgb16_clearbg(void);
+void gint_image_rgb16_swapcolor(void);
+void gint_image_rgb16_dye(void);
+
+void gint_image_p8_normal(void);
+void gint_image_p8_clearbg(void);
+void gint_image_p8_swapcolor(void);
+void gint_image_p8_dye(void);
+
+void gint_image_p4_normal(void);
+void gint_image_p4_clearbg(void);
+void gint_image_p4_swapcolor(void);
+void gint_image_p4_dye(void);
+
+#endif /* FXCG50 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GINT_IMAGE */
--- a/src/render-cg/image/image.c
+++ b/src/render-cg/image/image.c
@ -0,0 +1,107 @@
+#include <gint/image.h>
+#include <gint/display.h>
+
+/* gint_image_clip_input(): Clip a rendering box against the source image
+
+   Shrinks the source area of [b] so that it fits within [img]. When pixels
+   are removed on the left or top side, the target position (x/y) moves by the
+   same amount so that the remaining pixels are drawn at the same place.
+
+   @img    Source image, whose width/height bound the source area
+   @b      Box to clip (updated in place)
+   @out_w  Output width, used only for the visibility test
+   @out_h  Output height, used only for the visibility test
+
+   Returns false if the clipped box is empty or lies entirely outside of the
+   output rectangle (0,0)-(out_w,out_h); true otherwise. */
+bool gint_image_clip_input(image_t const *img, struct gint_image_box *b,
+	int out_w, int out_h)
+{
+	/* Adjust the bounding box of the input image */
+	if(b->left < 0) b->w += b->left, b->x -= b->left, b->left = 0;
+	if(b->top  < 0) b->h += b->top,  b->y -= b->top,  b->top  = 0;
+	if(b->left + b->w > img->width)  b->w = img->width  - b->left;
+	if(b->top  + b->h > img->height) b->h = img->height - b->top;
+
+	/* Check whether the box intersects the screen */
+	if(b->w <= 0 || b->h <= 0)
+		return false;
+	if(b->x + b->w <= 0 || b->x >= out_w)
+		return false;
+	/* The vertical extent is the height; testing the width here would
+	   wrongly drop tall boxes and keep wide ones that are off-screen */
+	if(b->y + b->h <= 0 || b->y >= out_h)
+		return false;
+
+	return true;
+}
+
+/* gint_image_clip_output(): Clip a rendering box against the output area
+   Restricts the target area of [b] to the rectangle (0,0)-(out_w,out_h).
+   When the left or top side is cut off, the source corner (left/top) advances
+   by the same amount so the remaining pixels keep their output position. */
+void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h)
+{
+	/* Horizontal axis: trim what hangs off the left, then the right */
+	if(b->x < 0) {
+		b->left -= b->x;
+		b->w += b->x;
+		b->x = 0;
+	}
+	if(b->x + b->w > out_w)
+		b->w = out_w - b->x;
+
+	/* Vertical axis: trim what hangs off the top, then the bottom */
+	if(b->y < 0) {
+		b->top -= b->y;
+		b->h += b->y;
+		b->y = 0;
+	}
+	if(b->y + b->h > out_h)
+		b->h = out_h - b->y;
+}
+
+/* gint_image_mkcmd(): Fill the format-independent part of a renderer command
+
+   Clips [box] (unless disabled by the NOCLIP flags in [effects]), then sets
+   the flip bits of [cmd->effect], the columns, input stride, x position, edge
+   offsets, input pointer, palette pointer (indexed formats only), height,
+   lines and output pointer. The [.loop] label, the color bits of [.effect]
+   and the color_1/color_2 fields are left for the caller.
+
+   Returns false without touching [cmd] when input clipping reports that
+   nothing is visible; true otherwise. */
+bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
+	int effects, bool left_edge, bool right_edge,
+	struct gint_image_cmd *cmd, int out_width, int out_height)
+{
+	/* Convert the old DIMAGE_NOCLIP flag */
+	if(effects & DIMAGE_NOCLIP)
+		effects |= IMAGE_NOCLIP;
+
+	if(!(effects & IMAGE_NOCLIP_INPUT)) {
+		if(!gint_image_clip_input(img, box, out_width, out_height))
+			return false;
+	}
+	if(!(effects & IMAGE_NOCLIP_OUTPUT))
+		gint_image_clip_output(box, out_width, out_height);
+
+	/* IMAGE_VFLIP/IMAGE_HFLIP are bits 8-9 of [effects]; they become bits
+	   0-1 of cmd->effect */
+	cmd->effect = (effects & (IMAGE_VFLIP | IMAGE_HFLIP)) >> 8;
+	cmd->columns = box->w;
+	cmd->input_stride = img->width;
+	cmd->x = box->x;
+	cmd->edge_1 = -1;
+	cmd->edge_2 = -1;
+
+	int p = img->profile;
+	/* With VFLIP, reading starts from the bottom row of the source box */
+	int input_row = (effects & IMAGE_VFLIP) ? box->top+box->h-1 : box->top;
+
+	if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) {
+		/* The stride is rounded up to an even number of pixels; each
+		   pixel is 2 bytes */
+		cmd->input_stride += (cmd->input_stride & 1);
+		cmd->input = (void *)img->data +
+			(input_row * cmd->input_stride + box->left) * 2;
+	}
+	else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) {
+		/* Pixels (1 byte each) follow the 2-byte palette size and the
+		   img->data[0] palette entries */
+		cmd->input = (void *)img->data + img->data[0] * 2 + 2 +
+			(input_row * img->width + box->left);
+		/* Palette start (offset 2) plus 256 bytes, which is the base
+		   for the signed indices described in image_t */
+		cmd->palette = (void *)img->data + 258;
+	}
+	else {
+		/* Any other profile is handled as P4: a 32-byte palette, then
+		   rows of (width + 1) / 2 bytes with two pixels per byte */
+		cmd->input = (void *)img->data + 32 +
+			input_row * ((img->width + 1) >> 1) + (box->left >> 1);
+		cmd->palette = img->data;
+		/* By default, use edge_1 to indicate (box->left & 1), so that
+		   functions that don't use edge_1 can still work properly */
+		if(!left_edge)
+			cmd->edge_1 = (box->left & 1);
+	}
+
+	/* Left edge: the source box starts on an odd pixel, so one extra
+	   column is rendered to keep the input 2-aligned. edge_1 is the offset
+	   of that extra pixel within the output line: the first pixel without
+	   HFLIP (the line then starts one pixel earlier), or the pixel just
+	   after the requested columns with HFLIP. */
+	if(left_edge && (box->left & 1)) {
+		if(effects & IMAGE_HFLIP) {
+			cmd->edge_1 = cmd->columns;
+		}
+		else {
+			cmd->x--;
+			cmd->edge_1 = 0;
+		}
+		cmd->columns++;
+	}
+	/* Right edge: pad the column count to an even number. With HFLIP the
+	   extra pixel lands at the start of the output line, so x moves one
+	   pixel left and edge_1 shifts by one to follow. */
+	if(right_edge && (cmd->columns & 1)) {
+		if(effects & IMAGE_HFLIP) {
+			cmd->x--;
+			cmd->edge_1++;
+			cmd->edge_2 = 0;
+		}
+		else {
+			cmd->edge_2 = cmd->columns;
+		}
+		cmd->columns++;
+	}
+
+	/* Settings for further updates */
+	cmd->height = box->h;
+
+	/* This is the default for gint, but Azur overwrites it */
+	cmd->lines = box->h;
+	/* Note: the output address uses gint's VRAM and DWIDTH, not out_width */
+	cmd->output = (void *)gint_vram + (DWIDTH * box->y + cmd->x) * 2;
+	return true;
+}
--- a/src/render-cg/image/image_macros.S
+++ b/src/render-cg/image/image_macros.S
@ -0,0 +1,25 @@
+/* START: Sets up the inner and outer loop. The outer loop is anything between
+   the calls to macros START and END, while the inner loop is the code between
+   labels 2: and 3: (both *INCLUDED*). */
+.macro START
+	/* Repeat-loop bounds: local labels 2 and 3 of the inner loop */
+	ldrs 2f
+	ldre 3f
+	/* Label 1 is where END branches back to for each new line; the repeat
+	   count is reloaded from r2 every time */
+1:	ldrc r2
+	nop
+.endm
+
+/* END: Finishes the outer loop and adds strides. */
+.macro END
+	/* Register roles below follow the register map in image_p4.S */
+	dt	r1		/* One line done; T=1 once r1 reaches 0 */
+	add	r4, r3		/* Input pointer += input stride */
+	bf.s	1b		/* Back to START's label 1 while T=0 */
+	add	r6, r5		/* (delay slot) Output pointer += stride */
+.endm
+
+/* EPILOGUE: Finishes the call by reloading registers saved in the prologue. */
+.macro EPILOGUE
+	mov.l	@r15+, r9	/* Restore r9 */
+	mov	r3, r0		/* Return value: r3 (final input pointer) */
+	rts
+	mov.l	@r15+, r8	/* (delay slot) Restore r8 */
+.endm
--- a/src/render-cg/image/image_p4.S
+++ b/src/render-cg/image/image_p4.S
@ -0,0 +1,86 @@
+.global _gint_image_p4_loop
+
+/* gint's image renderer: 4-bit indexed entry point
+
+   P4 compacts pixel data further than P8 by restricting values to a 16-color
+   palette and packing 2 pixels in each byte. This severely restricts our
+   ability to use sub-images because odd positions land within bytes.
+
+   Fortunately, we can solve this by using more edge pixels. The simplest way
+   to write a P4 loop is to process 2 pixels from a 2-aligned source image
+   position in a single iteration. Other structures don't even come close in
+   terms of CPU performance (which, as a reminder, is the main bottleneck in
+   Azur but not in gint): selecting nibbles individually is too long, while not
+   unrolling is still clearly inefficient. So it becomes very important to
+   forcibly align the sub-image on byte-aligned input boundaries and stick to
+   that grid.
+
+   Obviously, this approach causes up to one extra pixel to be overwritten on
+   each side of every line. We solve this problem by adding *another* edge
+   pixel on the left side. In the renderer this is called the left edge or
+   edge_1, while the standard one is called right edge or edge_2.
+
+   r0: - (initially: cmd.effect)
+   r1:  Number of lines remaining to draw
+   r2:  Number of columns per line
+   r3:  Input pointer
+   r4:  Input stride
+   r5:  Output pointer
+   r6:  Output stride
+   r7:  Right edge pointer
+   r8:  - (initially: cmd)
+   r9:  - (initially: cmd.loop)
+   r10: Left edge pointer */
+
+_gint_image_p4_loop:
+	/* r4: int output_width (pixels)
+	   r5: struct gint_image_cmd *cmd */
+
+	mov.b	@(1,r5), r0	/* cmd.effect */
+	add	#2, r5
+
+	mov.w	@r5+, r2	/* cmd.columns */
+	mov	r4, r6
+
+	mov.l	r8, @-r15
+	mov	r5, r8
+
+	/* From here on the command is r8 */
+
+	mov.l	r9, @-r15
+	sub	r2, r6
+
+	mov.w	@r8+, r4	/* cmd.input_stride */
+	add	r6, r6
+
+	mov.b	@r8+, r1	/* cmd.lines */
+	shlr	r4
+
+	mov.l	r10, @-r15
+	extu.b	r1, r1
+
+	mov.b	@r8+, r10	/* cmd.edge_1 */
+	nop
+
+	mov	#0, r9
+	addc	r9, r4		/* r4 = (img.width + 1) >> 1 */
+
+	mov.l	@r8+, r9
+	shlr	r0		/* T bit is now VFLIP */
+
+	mov.l	@r8+, r5	/* cmd.output */
+	nop
+
+	bf.s	_NO_VFLIP
+	mov.l	@r8+, r3	/* cmd.input */
+
+_VFLIP:
+	neg	r4, r4
+	nop
+
+_NO_VFLIP:
+	mov	r2, r7
+	shlr	r7
+
+	jmp	@r9
+	subc	r7, r4
--- a/src/render-cg/image/image_p4.c
+++ b/src/render-cg/image/image_p4.c
@ -0,0 +1,42 @@
+#include <gint/image.h>
+#include <gint/display.h>
+
+/* dimage_p4(): Render a whole P4 image at (x,y)
+   Equivalent to dsubimage_p4() on the full surface of the image. */
+void dimage_p4(int x, int y, image_t const *img, int eff)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p4(x, y, img, 0, 0, full_w, full_h, eff);
+}
+
+/* dsubimage_p4(): Render a section of a P4 image with no color effect
+
+   Images in the IMAGE_P4_RGB565A format are forwarded to the CLEARBG
+   renderer with the image's own alpha index, so that transparency is
+   honored. Other images are drawn with the opaque P4 loop.
+
+   @x @y       Target position of the top-left corner
+   @img        Source image
+   @left @top  Top-left corner of the section within the image
+   @w @h       Size of the section
+   @eff        Flags and geometric effects (IMAGE_* bitmask) */
+void dsubimage_p4(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff)
+{
+	/* A void function cannot "return" the result of another call in ISO
+	   C, so call first and return separately */
+	if(img->profile == IMAGE_P4_RGB565A) {
+		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff,
+			img->alpha);
+		return;
+	}
+
+	struct gint_image_box box = { x, y, w, h, left, top };
+	struct gint_image_cmd cmd;
+
+	/* No edge pixels are requested for the opaque loop */
+	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
+		DHEIGHT)) return;
+	cmd.loop = gint_image_p4_normal;
+	gint_image_p4_loop(DWIDTH, &cmd);
+}
+
+/* dimage_p4_clearbg(): Render a whole P4 image, treating palette index [bg]
+   as transparent */
+void dimage_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p4_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
+}
+
+/* dsubimage_p4_clearbg(): Render a section of a P4 image, skipping pixels
+   whose palette index is [bg_color]
+   The command is built with both edge pixels enabled, then completed with
+   the CLEARBG effect number, the transparent index and the CLEARBG loop. */
+void dsubimage_p4_clearbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd cmd;
+	struct gint_image_box box = {
+		.x = x, .y = y,
+		.w = w, .h = h,
+		.left = left, .top = top,
+	};
+
+	bool visible = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
+		DWIDTH, DHEIGHT);
+	if(!visible)
+		return;
+
+	cmd.loop = gint_image_p4_clearbg;
+	cmd.color_1 = bg_color;
+	/* Color effect 1 (CLEARBG) lives in bits 2-7 of the effect field */
+	cmd.effect += (1 << 2);
+	gint_image_p4_loop(DWIDTH, &cmd);
+}
--- a/src/render-cg/image/image_p4_clearbg.S
+++ b/src/render-cg/image/image_p4_clearbg.S
@ -0,0 +1,153 @@
+.global _gint_image_p4_clearbg
+#include "image_macros.S"
+
+/* P4 CLEARBG, RAM version: by NULL canceling.
+
+   This function is similar to P8 CLEARBG. Transparent pixels are not limited
+   by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an
+   explanation of NULL canceling.
+
+   r0:  [temporary]
+   r7:  Right edge pointer
+   r8:  Alpha value
+   r9:  Palette
+   r10: Left edge pointer
+   r11: Nullable output pointer
+   r12: 0 (in outer loop: edge stride)
+   r13: [temporary]
+   r14: [temporary]
+
+   Spilled to stack:
+   @(-12,r15): Right edge value
+   @(-8,r15): Left edge value
+   @(-4,r15): Edge stride */
+
+.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	shlr	r2
+	nop
+
+	add	r10, r10
+	nop
+
+	mov.l	@r8+, r9	/* cmd.palette */
+	mov	r2, r0
+
+	mov.w	@r8+, r7	/* cmd.edge_2 */
+	shll2	r0
+
+	mov.l	r12, @-r15
+	shll	r7
+
+	mov.l	r11, @-r15
+	add	r5, r7
+
+	mov	r0, r12
+	add	r6, r12
+
+	mov.l	r13, @-r15
+	add	r5, r10
+
+	mov.l	r14, @-r15
+	add	#-4, r5
+
+	mov.w	@r8, r8		/* cmd.color_1 */
+	add	#-1, r4		/* Input stride compensation for pipelining */
+
+ .if \HFLIP
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	shll	r8		/* alpha*2 compares against palette offsets */
+	nop
+
+	START
+
+	mov.b	@r3+, \TMP1
+	nop
+
+	mov.w	@r7, r0		/* Save right edge */
+	nop
+
+	mov.l	r0, @-r15
+	shll	\TMP1
+
+	mov.w	@r10, r0	/* Save left edge */
+	nop
+
+	mov.l	r0, @-r15
+	nop
+
+	mov.l	r12, @-r15
+	mov	#0, r12
+
+2:	mov	\TMP1, r0
+	and	#0x1e, r0
+
+	cmp/eq	r0, r8
+	mov	#-1, r11
+
+	addc	r12, r11
+	mov	#-4, \TMP2
+
+	and	r5, r11
+	mov.w	@(r0,r9), r0
+
+	shld	\TMP2, \TMP1
+	mov	#0x1e, \TMP2
+
+	and	\TMP2, \TMP1
+	mov.w	r0, @(\OFF1,r11)
+
+	cmp/eq	\TMP1, r8
+	mov	#-1, r11
+
+	addc	r12, r11
+	mov	\TMP1, r0
+
+	and	r5, r11
+	mov.b	@r3+, \TMP1
+
+	add	#\OUT_DIR, r5
+	mov.w	@(r0,r9), r0
+
+	mov.w	r0, @(\OFF2,r11)
+3:	shll	\TMP1
+
+	mov.l	@r15+, r12
+	nop
+
+	mov.l	@r15+, r0
+	nop
+
+	mov.w	r0, @r10	/* Restore left edge */
+	add	r12, r10
+
+	mov.l	@r15+, r0
+	nop
+
+	mov.w	r0, @r7		/* Restore right edge */
+	add	r12, r7
+
+	END
+
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r11
+	mov.l	@r15+, r12
+	mov.l	@r15+, r10
+	EPILOGUE
+.endm
+
+_gint_image_p4_clearbg:
+	/* r0 is cmd.effect shifted right once by gint_image_p4_loop, so its
+	   bit 0 is now HFLIP. Branch to the flipped variant when it is set. */
+	tst	#1, r0
+	bf	9f
+
+	/* No HFLIP: the output pointer advances by +4 bytes per pixel pair */
+	GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4
+	/* HFLIP: the output pointer moves by -4 bytes per pixel pair */
+9:	GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2
--- a/src/render-cg/image/image_p4_dye.S
+++ b/src/render-cg/image/image_p4_dye.S
@ -0,0 +1,147 @@
+.global _gint_image_p4_dye
+#include "image_macros.S"
+
+/* P4 DYE, RAM version: by NULL canceling.
+
+   Like with P8, this effect removes most of the complexity because there is no
+   longer any need to index the palette. However the decoding still takes a lot
+   of EX work so the performance is not as good. Since there are transparent
+   areas, Azur's CPU-bound version is at least to some extent faster than
+   bopti, so that's what we're using.
+
+   See P8 CLEARBG for an explanation of NULL canceling.
+
+   r0:  Dye value
+   r7:  Right edge pointer
+   r8:  Alpha value
+   r9:  0 (to neutralize addc during NULL-cancelling)
+   r10: Left edge pointer
+   r11: Nullable output pointer
+   r12: Edge stride
+   r13: [temporary]
+   r14: [temporary]
+
+   Spilled to stack:
+   @(-8,r15): Right edge value
+   @(-4,r15): Left edge value */
+
+.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	shlr	r2
+	nop
+
+	add	r10, r10
+	nop
+
+	mov.l	@r8+, r0	/* cmd.palette (don't care) */
+	mov	r2, r0
+
+	mov.w	@r8+, r7	/* cmd.edge_2 */
+	shll2	r0
+
+	mov.l	r12, @-r15
+	shll	r7
+
+	mov.l	r11, @-r15
+	add	r5, r7
+
+	mov	r0, r12
+	add	r6, r12
+
+	mov.l	r13, @-r15
+	add	r5, r10
+
+	mov.l	r14, @-r15
+	add	#-4, r5
+
+ .if \HFLIP
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	mov.w	@(2,r8), r0	/* cmd.color_2 (dye value) */
+	add	#-1, r4		/* Input stride compensation for pipelining */
+
+	mov.w	@r8, r8		/* cmd.color_1 (alpha value) */
+	nop
+
+	START
+
+	mov.b	@r3+, \TMP1
+	nop
+
+	mov.w	@r7, \TMP2	/* Save right edge */
+	nop
+
+	mov.l	\TMP2, @-r15
+	mov	#0x0f, \TMP2
+
+	mov.w	@r10, r9	/* Save left edge */
+	and	\TMP1, \TMP2
+
+	mov.l	r9, @-r15
+	mov	#0, r9
+
+2:	cmp/eq	\TMP2, r8
+	mov	#-1, r11
+
+	addc	r9, r11
+	mov	#-4, \TMP2
+
+	and	r5, r11
+	nop
+
+	shld	\TMP2, \TMP1
+	mov	#0x0f, \TMP2
+
+	and	\TMP2, \TMP1
+	mov.w	r0, @(\OFF1,r11)
+
+	cmp/eq	\TMP1, r8
+	mov	#-1, r11
+
+	addc	r9, r11
+	mov.b	@r3+, \TMP1
+
+	and	r5, r11
+	nop
+
+	mov	#0x0f, \TMP2
+	and	\TMP1, \TMP2
+
+	add	#\OUT_DIR, r5
+3:	mov.w	r0, @(\OFF2,r11)
+
+	mov.l	@r15+, \TMP2
+	nop
+
+	mov.w	\TMP2, @r10	/* Restore left edge */
+	add	r12, r10
+
+	mov.l	@r15+, \TMP2
+	nop
+
+	mov.w	\TMP2, @r7	/* Restore right edge */
+	add	r12, r7
+
+	END
+
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r11
+	mov.l	@r15+, r12
+	mov.l	@r15+, r10
+	EPILOGUE
+.endm
+
+_gint_image_p4_dye:
+	/* r0 is cmd.effect shifted right once by gint_image_p4_loop, so its
+	   bit 0 is now HFLIP. Branch to the flipped variant when it is set. */
+	tst	#1, r0
+	bf	9f
+
+	/* No HFLIP: the output pointer advances by +4 bytes per pixel pair */
+	GEN_DYE_LOOP 0, 4, r13, r14, 6, 4
+	/* HFLIP: the output pointer moves by -4 bytes per pixel pair */
+9:	GEN_DYE_LOOP 1, -4, r13, r14, 0, 2
--- a/src/render-cg/image/image_p4_dye.c
+++ b/src/render-cg/image/image_p4_dye.c
@ -0,0 +1,23 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_p4_dye(): Render a whole P4 image with every non-transparent pixel
+   replaced by [dye_color] */
+void dimage_p4_dye(int x, int y, image_t const *img, int eff, int dye_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p4_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
+}
+
+/* dsubimage_p4_dye(): Render a section of a P4 image with the DYE effect
+   The command is built with both edge pixels enabled. The image's alpha
+   index goes in color_1 and the dye color in color_2, which is where the DYE
+   loop reads them. */
+void dsubimage_p4_dye(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int dye_color)
+{
+	struct gint_image_cmd cmd;
+	struct gint_image_box box = {
+		.x = x, .y = y,
+		.w = w, .h = h,
+		.left = left, .top = top,
+	};
+
+	bool visible = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
+		DWIDTH, DHEIGHT);
+	if(!visible)
+		return;
+
+	cmd.loop = gint_image_p4_dye;
+	cmd.color_1 = img->alpha;
+	cmd.color_2 = dye_color;
+	/* Color effect 3 (DYE) lives in bits 2-7 of the effect field */
+	cmd.effect += (3 << 2);
+	gint_image_p4_loop(DWIDTH, &cmd);
+}
--- a/src/render-cg/image/image_p4_effect.c
+++ b/src/render-cg/image/image_p4_effect.c
@ -0,0 +1,32 @@
+#include <gint/image.h>
+
+/* dsubimage_p4_effect(): Variadic front-end for the P4 dynamic effects
+   Tests eff against IMAGE_CLEARBG, IMAGE_SWAPCOLOR, IMAGE_ADDBG and IMAGE_DYE
+   in that order and calls the first matching renderer; with none of them set
+   it calls dsubimage_p4(). The extra arguments are read as int: one for
+   CLEARBG, ADDBG and DYE, two (source index, then replacement color) for
+   SWAPCOLOR. */
+void dsubimage_p4_effect(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, ...)
+{
+	va_list ap;
+	va_start(ap, eff);
+
+	if(eff & IMAGE_CLEARBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff,
+			bg_color);
+	}
+	else if(eff & IMAGE_SWAPCOLOR) {
+		/* Two separate statements: the order of the reads matters */
+		int const old_index = va_arg(ap, int);
+		int const new_color = va_arg(ap, int);
+		dsubimage_p4_swapcolor(x, y, img, left, top, w, h, eff,
+			old_index, new_color);
+	}
+	else if(eff & IMAGE_ADDBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_p4_addbg(x, y, img, left, top, w, h, eff, bg_color);
+	}
+	else if(eff & IMAGE_DYE) {
+		int const dye_color = va_arg(ap, int);
+		dsubimage_p4_dye(x, y, img, left, top, w, h, eff, dye_color);
+	}
+	else {
+		dsubimage_p4(x, y, img, left, top, w, h, eff);
+	}
+
+	va_end(ap);
+}
--- a/src/render-cg/image/image_p4_normal.S
+++ b/src/render-cg/image/image_p4_normal.S
@ -0,0 +1,125 @@
+.global _gint_image_p4_normal
+#include "image_macros.S"
+
+/* P4 Opaque rendering, VRAM version: by unrolling without edge pixels.
+
+   This is the most unique function in the renderer, Azur included. A P4 image
+   cannot reasonably be decoded on a per-pixel basis because extracting half-
+   bytes is too slow. But using edge pixels results in extra write surface that
+   makes us slower than bopti in gint 2.7.
+
+   This loop is thus the only one to implement 2-unrolling (no pipeline) while
+   manually avoiding the writes that a pair of edge pixels usually fix. Subtle
+   adjustments to strides are involved, making this function one of the most
+   tricky.
+
+   A slight change is made to the command for the purpose of this function;
+   cmd.edge_1 (which is r10) is set to indicate whether the [left] side of the
+   box is even (r10=0) or odd (r10=1). This allows us to enter the loop at the
+   correct position.
+
+   r0:  [temporary]
+   r7:  [temporary]
+   r8:  Column counter
+   r9:  Palette
+   r10: box->left & 1
+   r11: [temporary] */
+
+/* GEN_NORMAL_LOOP: expand one copy of the opaque P4 loop.
+
+   HFLIP:   non-zero to assemble the extra setup that moves r5 and r6
+   OUT_DIR: byte step added to the output pointer r5 after each pixel
+   TMP1/2:  scratch registers
+   OFF1/2:  displacements of the two stores of an unrolled iteration
+
+   In the main loop each input byte yields two palette lookups: the high
+   half-byte is stored with OFF1, then the low half-byte with OFF2. END and
+   EPILOGUE are defined in image_macros.S, which is not visible here. */
+.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	mov.l	@r8+, r9	/* cmd.palette */
+	add	#-4, r5		/* Better positioning for @(OFF[12], r5) */
+
+	/* The following arithmetic is to decrease r4 if the width is even
+	   (r2 & 1) and left is odd (r10 = 1), since that means both the first
+	   and last pixel load a full byte but use only half */
+
+	mov	r2, r0
+	xor	#1, r0
+
+	mov.w	@r8+, r7	/* cmd.edge_2 (don't care) */
+	and	r10, r0
+
+	mov.l	r11, @-r15
+	sub	r0, r4
+
+	/* With HFLIP: r5 += 2 * r2 and r6 += 4 * r2 */
+ .if \HFLIP
+	mov	r2, r0
+	shll	r0
+
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	/* Per line: r8 restarts from r2 and is decremented once per pixel */
+1:	mov	r2, r8
+	tst	r10, r10	/* Check whether to do an extra half iter. */
+
+	bt	2f
+	nop
+
+	/* Additional half-iteration if box->left = 1 */
+	/* (byte << 1) & 0x1e is twice the low half-byte, used as a byte
+	   offset from the palette in r9 */
+
+	mov.b	@r3+, r0
+	shll	r0
+	and	#0x1e, r0
+	mov.w	@(r0, r9), r0
+	dt	r8
+	mov.w	r0, @(\OFF1, r5)
+	bt.s	3f
+	add	#\OUT_DIR, r5
+
+	/* The main loop needs to load pixels in output order. This is not
+	   ideal for CPU usage, but we have some margins */
+
+2:	mov.b	@r3+, \TMP1
+	mov	#-4, \TMP2
+
+	/* Stall */
+
+	shll	\TMP1
+	mov	\TMP1, r0
+
+	/* Shift right by 4, then mask: r0 = twice the high half-byte */
+	shld	\TMP2, r0
+	nop
+
+	and	#0x1e, r0
+	mov	#0x1e, \TMP2
+
+	/* Stall */
+
+	mov.w	@(r0,r9), r0
+	and	\TMP2, \TMP1	/* TMP1 = twice the low half-byte */
+
+	dt	r8
+	mov.w	r0, @(\OFF1,r5)
+
+	/* Leave if that was the last pixel of the line */
+	bt.s	3f
+	add	#\OUT_DIR, r5
+
+	mov	\TMP1, r0
+	add	#\OUT_DIR, r5
+
+	dt	r8
+	mov.w	@(r0,r9), r0
+
+	bf.s	2b
+	mov.w	r0, @(\OFF2,r5)
+
+3:	END
+
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	EPILOGUE
+.endm
+
+/* Entry point of the opaque P4 renderer. Bit 0 of r0 picks which expansion
+   of GEN_NORMAL_LOOP runs: clear -> HFLIP=0 with OUT_DIR=+2, set -> HFLIP=1
+   with OUT_DIR=-2 (label 9). The first expansion is expected to leave through
+   its EPILOGUE rather than fall into label 9 (EPILOGUE comes from
+   image_macros.S, not visible here - TODO confirm). */
+_gint_image_p4_normal:
+	tst	#1, r0
+	bf	9f
+
+	GEN_NORMAL_LOOP 0, 2, r7, r11, 4, 2
+9:	GEN_NORMAL_LOOP 1, -2, r7, r11, 2, 4
--- a/src/render-cg/image/image_p4_swapcolor.S
+++ b/src/render-cg/image/image_p4_swapcolor.S
@ -0,0 +1,175 @@
+.global _gint_image_p4_swapcolor
+#include "image_macros.S"
+
+/* P4 SWAPCOLOR, RAM version: by branchless xor selection.
+
+   I'm not sure whether this is the most optimized version for RAM. But it's
+   about 7-8% slower than bopti, and the effort of writing yet another
+   variation of P4's arduous loops doesn't seem worth it for a rare dynamic
+   effect. This is Azur's version.
+
+   See P8 SWAPCOLOR for an explanation of branchless xor selection.
+
+   r0:  [temporary]
+   r7:  Right edge pointer
+   r8:  palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
+   r9:  Palette
+   r10: Left edge pointer
+   r11: Holds (x ^ y) & -(c == x) during selection
+   r12: cmd.color_1
+   r13: [temporary]
+   r14: [temporary] (in outer loop: edge stride)
+
+   Spilled to stack:
+   @(-12,r15): Right edge value
+   @(-8,r15): Left edge value
+   @(-4,r15): Edge stride */
+
+/* GEN_SWAPCOLOR_LOOP: expand one copy of the P4 SWAPCOLOR loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: byte step added to r5 once per input byte (two pixels)
+   TMP1/2:  scratch registers
+   OFF1/2:  displacements of the two stores; OFF1 is applied before r5 is
+            stepped, OFF2 after
+
+   Setup before START: r2 is halved; r10 and r7 become r5 + 2 * (their
+   initial value); r14 = 4 * (halved r2) + r6 is later added to both edge
+   pointers after every line; r12 ends up as 2 * cmd.color_1 and r8 as
+   palette[cmd.color_1] ^ cmd.color_2. START, END and EPILOGUE are defined in
+   image_macros.S, which is not visible here. */
+.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	shlr	r2
+	nop
+
+	add	r10, r10
+	nop
+
+	mov.l	@r8+, r9	/* cmd.palette */
+	mov	r2, r0
+
+	mov.w	@r8+, r7	/* cmd.edge_2 */
+	shll2	r0
+
+	mov.l	r12, @-r15
+	shll	r7
+
+	mov.l	r13, @-r15
+	add	r5, r7
+
+	mov.w	@r8+, r13	/* cmd.color_1 */
+	add	r5, r10
+
+	mov.l	r11, @-r15
+	add	#-4, r5
+
+	mov	r13, r12
+	shll	r13
+
+	mov.l	r14, @-r15
+	add	r9, r13
+
+	mov.w	@r8, r8		/* cmd.color_2 */
+	add	#-1, r4		/* Input stride compensation for pipelining */
+
+	mov.w	@r13, r13	/* palette[cmd.color_1] */
+	mov	r0, r14
+
+	add	r6, r14
+	nop
+
+	xor	r13, r8
+	nop
+
+	/* With HFLIP: r5 += 4 * (halved r2) and r6 += 8 * (halved r2) */
+ .if \HFLIP
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	shll	r12		/* Compare color_1 * 2 with shifted values */
+	nop
+
+	START
+
+	/* Per-line prologue: preload the first byte (kept doubled in TMP1)
+	   and spill both edge values and the edge stride to the stack */
+	mov.b	@r3+, \TMP1
+	nop
+
+	mov.w	@r7, r0		/* Save right edge */
+	nop
+
+	mov.l	r0, @-r15
+	shll	\TMP1
+
+	mov.w	@r10, r0	/* Save left edge */
+	nop
+
+	mov.l	r0, @-r15
+	nop
+
+	mov.l	r14, @-r15
+	nop
+
+	/* First pixel of the pair: r0 = twice the low half-byte */
+2:	mov	\TMP1, r0
+	and	#0x1e, r0
+
+	cmp/eq	r0, r12
+	mov	#-4, \TMP2
+
+	subc	r11, r11	/* r11 = -T: all ones if the index matched */
+	nop
+
+	mov.w	@(r0,r9), r0
+	and	r8, r11
+
+	shld	\TMP2, \TMP1
+	mov	#0x1e, \TMP2
+
+	xor	r11, r0
+	mov.w	r0, @(\OFF1,r5)
+
+	/* Second pixel of the pair: TMP1 = twice the high half-byte */
+	and	\TMP2, \TMP1
+	nop
+
+	cmp/eq	\TMP1, r12
+	nop
+
+	subc	r11, r11
+	mov	\TMP1, r0
+
+	add	#\OUT_DIR, r5
+	mov.b	@r3+, \TMP1	/* Preload the next input byte */
+
+	and	r8, r11
+	mov.w	@(r0,r9), r0
+
+	shll	\TMP1
+	nop
+
+	xor	r11, r0
+3:	mov.w	r0, @(\OFF2,r5)
+
+
+	/* Per-line epilogue: pop the spilled values in reverse order, write
+	   the edges back, and advance both edge pointers by r14 */
+	mov.l	@r15+, r14
+	nop
+
+	mov.l	@r15+, r0
+	nop
+
+	mov.w	r0, @r10	/* Restore left edge */
+	add	r14, r10
+
+	mov.l	@r15+, r0
+	nop
+
+	mov.w	r0, @r7		/* Restore right edge */
+	add	r14, r7
+
+	END
+
+	mov.l	@r15+, r14
+	mov.l	@r15+, r11
+	mov.l	@r15+, r13
+	mov.l	@r15+, r12
+	mov.l	@r15+, r10
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_p4_swapcolor() and
+   dsubimage_p4_addbg(). Bit 0 of r0 picks which expansion of
+   GEN_SWAPCOLOR_LOOP runs: clear -> HFLIP=0 with OUT_DIR=+4, set -> HFLIP=1
+   with OUT_DIR=-4 (label 9). The first expansion is expected to leave through
+   its EPILOGUE (defined in image_macros.S, not visible here - TODO
+   confirm). */
+_gint_image_p4_swapcolor:
+	tst	#1, r0
+	bf	9f
+
+	GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 6, 0
+9:	GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 0, 6
--- a/src/render-cg/image/image_p4_swapcolor.c
+++ b/src/render-cg/image/image_p4_swapcolor.c
@ -0,0 +1,46 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_p4_swapcolor(): Render a whole P4 image, replacing one color
+   Forwards to dsubimage_p4_swapcolor() with a box starting at (0, 0) and
+   spanning img->width by img->height. */
+void dimage_p4_swapcolor(int x, int y, image_t const *img, int eff,
+	int old_color, int new_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p4_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
+		old_color, new_color);
+}
+
+/* dsubimage_p4_swapcolor(): Render a section of a P4 image, replacing one
+   color
+   old_index goes to cmd.color_1 and new_color to cmd.color_2; cmd.effect is
+   raised by 8 and the command runs through gint_image_p4_loop() with
+   gint_image_p4_swapcolor as its loop. Nothing is drawn when
+   gint_image_mkcmd() returns false. */
+void dsubimage_p4_swapcolor(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int old_index, int new_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, true, true, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p4_swapcolor;
+		command.color_2 = new_color;
+		command.color_1 = old_index;
+		command.effect += 8;
+		gint_image_p4_loop(DWIDTH, &command);
+	}
+}
+
+/* dimage_p4_addbg(): Render a whole P4 image over a background color
+   Forwards to dsubimage_p4_addbg() with a box starting at (0, 0) and spanning
+   img->width by img->height. */
+void dimage_p4_addbg(int x, int y, image_t const *img, int eff,
+	int bg_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p4_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
+}
+
+/* dsubimage_p4_addbg(): Render a section of a P4 image over a background
+   color
+   Reuses the swapcolor loop: cmd.color_1 is img->alpha and cmd.color_2 is
+   bg_color, with cmd.effect raised by 8. Nothing is drawn when
+   gint_image_mkcmd() returns false. */
+void dsubimage_p4_addbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, true, true, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p4_swapcolor;
+		command.color_2 = bg_color;
+		command.color_1 = img->alpha;
+		command.effect += 8;
+		gint_image_p4_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_p8.S
+++ b/src/render-cg/image/image_p8.S
@ -0,0 +1,103 @@
+.global _gint_image_p8_loop
+
+/* gint's image renderer: 8-bit indexed entry point
+
+   P8 compacts images by indexing each pixel on a 256-color palette, thus
+   halving the amount of data per pixel. This comes at the cost of an
+   additional lookup during rendering. For this format, there is no way to
+   bundle pixels together, and the more advanced loops handle pixels
+   individually with a 2-unrolled 2-stage-pipeline structure to accelerate the
+   CPU processing when that is the bottleneck (which often means where there
+   are transparent pixels to skip).
+
+   For readers not familiar with loop optimization literature, the main idea is
+   that a simple loop which loads a pixel, processes it, and writes it, is too
+   inefficient because of RAW delays. To use the full speed of the CPU, one
+   needs to do more work in parallel and spread out actions on a single pixel,
+   which we do here with two loop transforms:
+
+   * _Pipelining_ the loop consists in handling a single pixel over several
+     iterations by doing a little bit of work in each iteration. The data for
+     the pixel would move from register to register at each iteration, with the
+     loop code doing one stage's worth of computation on each register. This
+     gives us more pixels to work on simultaneously, and more independent work
+     means less RAW limitations. Loops in this renderer have 2 stages at most.
+
+  * _Unrolling_ iterations of the loop consists in loading two (or more) pixels
+     at the start of each iteration so that we can work on one while waiting
+     for stalls and dependencies on the other. Unlike pipelining, pixels are
+     still confined within iterations. Non-trivial loops in this renderer
+     process 2 pixels per iteration.
+
+   Unrolling has one major flaw: handling pairs of pixels only works if the
+   total amount of pixels to draw is even. The usual way to handle this for n
+   pixels is to do ⌊n/2⌋ iterations and handle the last pixel individually if n
+   is odd. This is extremely annoying, since every row must check the value of
+   n, and an extra copy of the loop code for a single pixel must be maintained
+   on the side, which takes more space and more effort.
+
+   However, we have a specialized solution here with *edge pixels*. The idea of
+   edge pixels is to round the number of pixels *up* and perform ⌊(n+1)/2⌋ runs
+   of the inner loop. If n is odd, this will overwrite a single pixel at the
+   end of the line. We can cancel this error after-the-fact by saving the value
+   of the (n+1)-th pixel of the line before the loop, and restoring it
+   afterwards. Note that if n is even then the save/restore is a no-op.
+
+   This takes some caution however, as the temporary overwrite could be seen by
+   an interrupt. Some measures are put in place to reserve a couple of bytes on
+   each side of gint's VRAM and Azur's target fragment to avoid any problems.
+
+   r0: - (initially: cmd.effect)
+   r1: Number of lines remaining to draw
+   r2: Number of columns per line
+   r3: Input pointer
+   r4: Input stride
+   r5: Output pointer
+   r6: Output stride
+   r7: Right edge or [temporary]
+   r8: - (initially: cmd)
+   r9: - (initially: cmd.loop) */
+
+/* Common P8 entry: unpacks the command at r5 into registers, then jumps to
+   the loop function read from the command. On the jump:
+     r0 = cmd.effect >> 1, r1 = cmd.lines (zero-extended), r2 = cmd.columns,
+     r3 = cmd.input, r5 = cmd.output, r6 = 2 * (output_width - columns),
+     r4 = input_stride - columns (input_stride negated first when bit 0 of
+     cmd.effect is set), r8 = address of the command field following
+     cmd.input, r9 = the loop function.
+   The caller's r8 and r9 are pushed on the stack and are not popped here. */
+_gint_image_p8_loop:
+	/* r4: int output_width (pixels)
+	   r5: struct gint_image_cmd *cmd */
+
+	mov.b	@(1,r5), r0	/* cmd.effect */
+	add	#2, r5
+
+	mov.l	r8, @-r15
+	mov	r4, r6
+
+	mov.w	@r5+, r2	/* cmd.columns */
+	mov	r5, r8
+
+	/* From here on the command is r8 */
+
+	mov.l	r9, @-r15
+	shlr	r0		/* T bit is now VFLIP */
+
+	mov.w	@r8+, r4	/* cmd.input_stride */
+	sub	r2, r6
+
+	mov.b	@r8+, r1	/* cmd.lines */
+	add	r6, r6
+
+	mov.b	@r8+, r9	/* cmd.edge_1 - don't care */
+	nop
+
+	/* Loop function to jump to; overwrites the edge_1 byte just read */
+	mov.l	@r8+, r9
+	extu.b	r1, r1
+
+	mov.l	@r8+, r5	/* cmd.output */
+	nop
+
+	/* The load below sits in the delay slot, so it runs on both paths */
+	bf.s	_NO_VFLIP
+	mov.l	@r8+, r3	/* cmd.input */
+
+_VFLIP:
+	neg	r4, r4
+	nop
+
+_NO_VFLIP:
+	jmp	@r9
+	sub	r2, r4
--- a/src/render-cg/image/image_p8.c
+++ b/src/render-cg/image/image_p8.c
@ -0,0 +1,42 @@
+#include <gint/image.h>
+#include <gint/display.h>
+
+/* dimage_p8(): Render a whole P8 image
+   Forwards to dsubimage_p8() with a box starting at (0, 0) and spanning
+   img->width by img->height. */
+void dimage_p8(int x, int y, image_t const *img, int eff)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p8(x, y, img, 0, 0, full_w, full_h, eff);
+}
+
+/* dsubimage_p8(): Render a section of a P8 image
+   Images whose profile is IMAGE_P8_RGB565A are delegated to
+   dsubimage_p8_clearbg() with img->alpha as the background value. Every other
+   profile is drawn with the gint_image_p8_normal loop. Nothing is drawn when
+   gint_image_mkcmd() returns false. */
+void dsubimage_p8(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff)
+{
+	if(img->profile == IMAGE_P8_RGB565A) {
+		/* Call, then return: ISO C forbids "return expr;" in a
+		   function returning void, even when expr has type void */
+		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
+			img->alpha);
+		return;
+	}
+
+	struct gint_image_box box = { x, y, w, h, left, top };
+	struct gint_image_cmd cmd;
+
+	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
+		DHEIGHT)) return;
+	cmd.loop = gint_image_p8_normal;
+	gint_image_p8_loop(DWIDTH, &cmd);
+}
+
+/* dimage_p8_clearbg(): Render a whole P8 image, skipping one color
+   Forwards to dsubimage_p8_clearbg() with a box starting at (0, 0) and
+   spanning img->width by img->height. */
+void dimage_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p8_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
+}
+
+/* dsubimage_p8_clearbg(): Render a section of a P8 image, skipping one color
+   bg_color goes to cmd.color_1; cmd.effect is raised by 4 and the command
+   runs through gint_image_p8_loop() with gint_image_p8_clearbg as its loop.
+   Nothing is drawn when gint_image_mkcmd() returns false. */
+void dsubimage_p8_clearbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, true, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p8_clearbg;
+		command.color_1 = bg_color;
+		command.effect += 4;
+		gint_image_p8_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_p8_clearbg.S
+++ b/src/render-cg/image/image_p8_clearbg.S
@ -0,0 +1,147 @@
+.global _gint_image_p8_clearbg
+#include "image_macros.S"
+
+/* P8 CLEARBG, RAM version: by NULL canceling.
+
+   This function is one of the few that can still be bottlenecked by CPU in the
+   RAM model. This is because transparent pixels can be skipped over as fast as
+   the CPU allows without worrying about the writing speed of the RAM.
+
+   For some reason that I have yet to uncover, branches are way slower than the
+   SH4AL-DSP manual suggests, and even slower while inside of DSP loops. This
+   completely favors branchless methods, and the one used here is one I call
+   "NULL canceling".
+
+   The idea is that a write can be turned into a no-op by either writing the
+   value that is already in memory, or by writing somewhere else. The first
+   option is pretty slow, especially because it requires a selection operation
+   (rn = condition ? rn : rm) which is like the most general branchless trick.
+
+   NULL canceling abuses the fact that NULL is mapped read-only on the platform
+   to turn the target pointer into NULL with the following identity:
+
+      target & -(condition) = (condition ? target : NULL)
+
+   The term -(condition) is materialized with an [addc #-1, #0] instruction
+   after the test, then the result is applied onto the target pointer with
+   [and], completing the trick in only 2 EX instructions. It does take more
+   registers, and prevents from using pre-decrement on the target.
+
+   r0:  [temporary]
+   r7:  Right edge pointer
+   r8:  Alpha value
+   r9:  Palette
+   r10: Nullable output pointer
+   r11: 0 (to neutralize addc during NULL-cancelling)
+   r12: Right edge stride
+   r13: [temporary]
+   r14: [temporary]
+
+   Spilled to stack:
+   @(-4,r15): Right edge value */
+
+/* GEN_CLEARBG_LOOP: expand one copy of the P8 CLEARBG loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: byte step added to r5 once per unrolled iteration (two pixels)
+   TMP1/2:  scratch registers holding the two input bytes in flight
+   OFF1/2:  displacements of the two stores; OFF1 is applied before r5 is
+            stepped, OFF2 after
+
+   Setup before START: r2 is halved; r7 becomes r5 + 2 * cmd.edge_2;
+   r12 = 4 * (halved r2) + r6 is added to r7 after every line. START, END and
+   EPILOGUE are defined in image_macros.S, which is not visible here. */
+.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	mov.l	@r8+, r9	/* cmd.palette */
+	shlr	r2
+
+	mov.w	@r8+, r7	/* cmd.edge_2 */
+	mov	r2, r0
+
+	mov.l	r12, @-r15
+	shll2	r0
+
+	mov.l	r10, @-r15
+	shll	r7
+
+	mov.l	r11, @-r15
+	add	r5, r7
+
+	mov	r0, r12
+	add	r6, r12
+
+	mov.l	r13, @-r15
+	add	#-4, r5
+
+	mov.l	r14, @-r15
+	add	#-2, r4		/* Input stride compensation for pipelining */
+
+	mov.w	@r8, r8		/* cmd.color_1 ≤ 255, thus zero-extended */
+	mov	#0, r11
+
+	/* With HFLIP: r5 += 4 * (halved r2) and r6 += 8 * (halved r2) */
+ .if \HFLIP
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	START
+
+	/* Per-line prologue: two bytes are loaded ahead of the loop body and
+	   the first comparison against r8 is already in T */
+	mov.b	@r3+, \TMP2
+	nop
+
+	mov.w	@r7, r0		/* Save right edge */
+	nop
+
+	mov.l	r0, @-r15
+	cmp/eq	\TMP2, r8
+
+	mov.b	@r3+, \TMP1
+	add	\TMP2, \TMP2
+
+	/* r10 = -1 + 0 + T: 0 when the pixel equals r8, -1 otherwise */
+2:	mov	#-1, r10
+	addc	r11, r10	/* r10 is now the mask */
+
+	and	r5, r10
+	mov	\TMP2, r0
+
+	cmp/eq	\TMP1, r8
+	mov.w	@(r0, r9), r0
+
+	mov.w	r0, @(\OFF1, r10)
+	add	#\OUT_DIR, r5
+
+	mov.b	@r3+, \TMP2
+	nop
+
+	mov	#-1, r10
+	addc	r11, r10
+
+	add	\TMP1, \TMP1
+	mov	\TMP1, r0
+
+	mov.b	@r3+, \TMP1
+	and	r5, r10
+
+	mov.w	@(r0, r9), r0
+	cmp/eq	\TMP2, r8
+
+	mov.w	r0, @(\OFF2, r10)
+3:	add	\TMP2, \TMP2
+
+	mov.l	@r15+, r0
+	nop
+
+	mov.w	r0, @r7		/* Restore right edge */
+	add	r12, r7
+
+	END
+
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r12
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_p8_clearbg(). Bit 0 of r0
+   (cmd.effect >> 1 when coming from _gint_image_p8_loop) picks which
+   expansion of GEN_CLEARBG_LOOP runs: clear -> HFLIP=0 with OUT_DIR=+4,
+   set -> HFLIP=1 with OUT_DIR=-4 (label 9). The first expansion is expected
+   to leave through its EPILOGUE (defined in image_macros.S, not visible
+   here - TODO confirm). */
+_gint_image_p8_clearbg:
+	tst	#1, r0
+	bf	9f
+
+	GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2
+9:	GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4
--- a/src/render-cg/image/image_p8_dye.S
+++ b/src/render-cg/image/image_p8_dye.S
@ -0,0 +1,115 @@
+.global _gint_image_p8_dye
+#include "image_macros.S"
+
+/* P8 DYE, RAM version: by NULL canceling.
+
+   This effect basically removes all the complexity out of P8 because we no
+   longer need to index the palette. We only keep the tight loop so that the
+   CPU can speed in areas with many transparent pixels. This gives some
+   acceleration over bopti.
+
+   See P8 CLEARBG for an explanation of NULL canceling.
+
+   r0:  Dye value
+   r7:  Right edge pointer
+   r8:  Alpha value
+   r9:  Right edge value
+   r10: Nullable output pointer
+   r11: 0 (to neutralize addc during NULL-cancelling)
+   r12: Right edge stride
+   r13: [temporary]
+   r14: [temporary] */
+
+/* GEN_DYE_LOOP: expand one copy of the P8 DYE loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: byte step added to r5 once per unrolled iteration (two pixels)
+   TMP1/2:  scratch registers holding the two input bytes in flight
+   OFF1/2:  displacements of the two stores; OFF1 is applied before r5 is
+            stepped, OFF2 after
+
+   Every pixel that differs from r8 is written as the fixed value in r0; the
+   palette is loaded into r9 but that register is then reused for the saved
+   right edge. START, END and EPILOGUE are defined in image_macros.S, which is
+   not visible here. */
+.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
+	mov.l	@r8+, r9	/* cmd.palette (don't care) */
+	shlr	r2
+
+	mov.w	@r8+, r7	/* cmd.edge_2 */
+	mov	r2, r0
+
+	mov.l	r12, @-r15
+	shll2	r0
+
+	mov.l	r10, @-r15
+	shll	r7
+
+	mov.l	r11, @-r15
+	add	r5, r7
+
+	mov	r0, r12
+	add	r6, r12
+
+	mov.l	r13, @-r15
+	add	#-4, r5
+
+	mov.l	r14, @-r15
+	add	#-2, r4		/* Input stride compensation for pipelining */
+
+	/* With HFLIP: r5 += 4 * (halved r2) and r6 += 8 * (halved r2) */
+ .if \HFLIP
+	add	r0, r5
+	nop
+
+	shll	r0
+	nop
+
+	add	r0, r6
+	nop
+ .endif
+
+	mov.w	@(2,r8), r0	/* cmd.color_2 (dye value) */
+	nop
+
+	mov.w	@r8, r8		/* cmd.color_1 ≤ 255, thus zero-extended */
+	mov	#0, r11
+
+	START
+
+	/* Per-line prologue: two bytes are loaded ahead of the loop body and
+	   the first comparison against r8 is already in T */
+	mov.b	@r3+, \TMP2
+	nop
+
+	mov.w	@r7, r9		/* Save right edge */
+	nop
+
+	mov.b	@r3+, \TMP1
+	cmp/eq	\TMP2, r8
+
+	/* r10 = -1 + 0 + T: 0 when the pixel equals r8, -1 otherwise */
+2:	mov	#-1, r10
+	addc	r11, r10	/* r10 is now the mask */
+
+	and	r5, r10
+	nop
+
+	mov.b	@r3+, \TMP2
+	cmp/eq	\TMP1, r8
+
+	mov.w	r0, @(\OFF1, r10)
+	add	#\OUT_DIR, r5
+
+	mov	#-1, r10
+	addc	r11, r10
+
+	mov.b	@r3+, \TMP1
+	and	r5, r10
+
+	cmp/eq	\TMP2, r8
+3:	mov.w	r0, @(\OFF2, r10)
+
+	mov.w	r9, @r7		/* Restore right edge */
+	add	r12, r7
+
+	END
+
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r12
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_p8_dye(). Bit 0 of r0
+   (cmd.effect >> 1 when coming from _gint_image_p8_loop) picks which
+   expansion of GEN_DYE_LOOP runs: clear -> HFLIP=0 with OUT_DIR=+4, set ->
+   HFLIP=1 with OUT_DIR=-4 (label 9). The first expansion is expected to leave
+   through its EPILOGUE (defined in image_macros.S, not visible here - TODO
+   confirm). */
+_gint_image_p8_dye:
+	tst	#1, r0
+	bf	9f
+
+	GEN_DYE_LOOP 0, 4, r13, r14, 4, 2
+9:	GEN_DYE_LOOP 1, -4, r13, r14, 2, 4
--- a/src/render-cg/image/image_p8_dye.c
+++ b/src/render-cg/image/image_p8_dye.c
@ -0,0 +1,23 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_p8_dye(): Render a whole P8 image with the dye effect
+   Forwards to dsubimage_p8_dye() with a box starting at (0, 0) and spanning
+   img->width by img->height. */
+void dimage_p8_dye(int x, int y, image_t const *img, int eff, int dye_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p8_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
+}
+
+/* dsubimage_p8_dye(): Render a section of a P8 image with the dye effect
+   cmd.color_1 receives img->alpha and cmd.color_2 the dye color; cmd.effect
+   is raised by 12 and the command runs through gint_image_p8_loop() with
+   gint_image_p8_dye as its loop. Nothing is drawn when gint_image_mkcmd()
+   returns false. */
+void dsubimage_p8_dye(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int dye_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, true, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p8_dye;
+		command.color_2 = dye_color;
+		command.color_1 = img->alpha;
+		command.effect += 12;
+		gint_image_p8_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_p8_effect.c
+++ b/src/render-cg/image/image_p8_effect.c
@ -0,0 +1,32 @@
+#include <gint/image.h>
+
+/* dsubimage_p8_effect(): Variadic front-end for the P8 dynamic effects
+   Tests eff against IMAGE_CLEARBG, IMAGE_SWAPCOLOR, IMAGE_ADDBG and IMAGE_DYE
+   in that order and calls the first matching renderer; with none of them set
+   it calls dsubimage_p8(). The extra arguments are read as int: one for
+   CLEARBG, ADDBG and DYE, two (source index, then replacement color) for
+   SWAPCOLOR. */
+void dsubimage_p8_effect(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, ...)
+{
+	va_list ap;
+	va_start(ap, eff);
+
+	if(eff & IMAGE_CLEARBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
+			bg_color);
+	}
+	else if(eff & IMAGE_SWAPCOLOR) {
+		/* Two separate statements: the order of the reads matters */
+		int const old_index = va_arg(ap, int);
+		int const new_color = va_arg(ap, int);
+		dsubimage_p8_swapcolor(x, y, img, left, top, w, h, eff,
+			old_index, new_color);
+	}
+	else if(eff & IMAGE_ADDBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_p8_addbg(x, y, img, left, top, w, h, eff, bg_color);
+	}
+	else if(eff & IMAGE_DYE) {
+		int const dye_color = va_arg(ap, int);
+		dsubimage_p8_dye(x, y, img, left, top, w, h, eff, dye_color);
+	}
+	else {
+		dsubimage_p8(x, y, img, left, top, w, h, eff);
+	}
+
+	va_end(ap);
+}
--- a/src/render-cg/image/image_p8_normal.S
+++ b/src/render-cg/image/image_p8_normal.S
@ -0,0 +1,42 @@
+.global _gint_image_p8_normal
+#include "image_macros.S"
+
+/* P8 Opaque rendering, RAM version: trivial.
+
+   As usual with RAM it is fairly easy to bottleneck writing speed, and so
+   there is no need for complex methods. Building longwords could be an option,
+   but it would require output alignment with edges, which is painful. */
+
+/* GEN_NORMAL_LOOP: expand one copy of the opaque P8 loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: byte step added to the output pointer r5 after each pixel
+
+   One pixel per iteration: the input byte is doubled to index the palette in
+   r9 and the 16-bit entry is stored at r5. END and EPILOGUE are defined in
+   image_macros.S, which is not visible here. */
+.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR
+	mov.l	@r8+, r9	/* cmd.palette */
+
+	/* With HFLIP: r5 += 2 * r2 - 2 and r6 += 4 * r2 */
+ .if \HFLIP
+	add	#-2, r5
+	mov	r2, r0
+	shll	r0
+	add	r0, r5
+	shll	r0
+	add	r0, r6
+ .endif
+
+	/* Per line: r8 counts down from r2 */
+1:	mov	r2, r8
+
+2:	mov.b	@r3+, r0
+	shll	r0
+	mov.w	@(r0, r9), r0
+	mov.w	r0, @r5
+
+	/* The pointer step sits in the delay slot of the branch */
+3:	dt	r8
+	bf.s	2b
+	add	#\OUT_DIR, r5
+
+	END
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_p8(). Bit 0 of r0
+   (cmd.effect >> 1 when coming from _gint_image_p8_loop) picks which
+   expansion of GEN_NORMAL_LOOP runs: clear -> HFLIP=0 with OUT_DIR=+2, set ->
+   HFLIP=1 with OUT_DIR=-2 (label 9). The first expansion is expected to leave
+   through its EPILOGUE (defined in image_macros.S, not visible here - TODO
+   confirm). */
+_gint_image_p8_normal:
+	tst	#1, r0
+	bf	9f
+
+	GEN_NORMAL_LOOP 0, 2
+9:	GEN_NORMAL_LOOP 1, -2
--- a/src/render-cg/image/image_p8_swapcolor.S
+++ b/src/render-cg/image/image_p8_swapcolor.S
@ -0,0 +1,77 @@
+.global _gint_image_p8_swapcolor
+#include "image_macros.S"
+
+/* P8 SWAPCOLOR, RAM version: by branchless xor selection.
+
+   The core action of this loop is to render full pixels while replacing any
+   occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). Branching is
+   too slow as often, so instead we use the fact that both x and y are fixed to
+   use the identity
+
+     c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c)
+
+   We materialize -(c == x) by subtracting a register from itself with subc
+   after the comparison (which is delightfully elegant), while (x ^ y) is pre-
+   computed. This way, the selection is performed in one [subc], one [and] and
+   one [xor] for a total of 3 EX slots. This is slower than NULL-cancelling
+   (which only takes 2 EX slots) but still better than symmetric alternatives.
+
+   Since we have a palette, we further trick by comparing against the index but
+   selecting against the palette entry, ie. we do
+
+      palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c])
+
+   which allows the computation to occur in parallel with the palette access
+   and does not require the replacement value to be located at a valid index.
+
+   r0:  [temporary]
+   r7:  cmd.color_1
+   r8:  palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
+   r9:  Palette
+   r10: Holds (x ^ y) & -(c == x) during selection */
+
+/* GEN_SWAPCOLOR_LOOP: expand one copy of the P8 SWAPCOLOR loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: byte step added to the output pointer r5 after each pixel
+
+   Setup: r7 = cmd.color_1 sign-extended from its low byte, so that it
+   compares equal to bytes loaded with mov.b; r8 = palette[r7] ^ cmd.color_2.
+   START, END and EPILOGUE are defined in image_macros.S, which is not visible
+   here. */
+.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
+	mov.l	@r8+, r9	/* cmd.palette */
+	mov.w	@r8+, r0	/* cmd.edge_2 (don't care) */
+	mov.w	@r8+, r7	/* cmd.color_1 */
+	mov.l	r10, @-r15
+	exts.b	r7, r7
+	mov	r7, r0
+	mov.w	@r8, r8		/* cmd.color_2 */
+	add	r0, r0
+	mov.w	@(r0, r9), r0
+	xor	r0, r8
+
+	/* With HFLIP: r5 += 2 * r2 - 2 and r6 += 4 * r2 */
+ .if \HFLIP
+	add	#-2, r5
+	mov	r2, r0
+	shll	r0
+	add	r0, r5
+	shll	r0
+	add	r0, r6
+ .endif
+
+	START
+
+	/* r10 = -T after subc, ie. all ones when the index equals r7; the
+	   palette entry is then xor-ed with r8 only in that case */
+2:	mov.b	@r3+, r0
+	cmp/eq	r0, r7
+	add	r0, r0
+	subc	r10, r10
+	mov.w	@(r0, r9), r0
+	and	r8, r10
+	xor	r10, r0
+	mov.w	r0, @r5
+3:	add	#\OUT_DIR, r5
+
+	END
+
+	mov.l	@r15+, r10
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_p8_swapcolor() and
+   dsubimage_p8_addbg(). Bit 0 of r0 (cmd.effect >> 1 when coming from
+   _gint_image_p8_loop) picks which expansion of GEN_SWAPCOLOR_LOOP runs:
+   clear -> HFLIP=0 with OUT_DIR=+2, set -> HFLIP=1 with OUT_DIR=-2 (label 9).
+   The first expansion is expected to leave through its EPILOGUE (defined in
+   image_macros.S, not visible here - TODO confirm). */
+_gint_image_p8_swapcolor:
+	tst	#1, r0
+	bf	9f
+
+	GEN_SWAPCOLOR_LOOP 0, 2
+9:	GEN_SWAPCOLOR_LOOP 1, -2
--- a/src/render-cg/image/image_p8_swapcolor.c
+++ b/src/render-cg/image/image_p8_swapcolor.c
@ -0,0 +1,46 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_p8_swapcolor(): Render a whole P8 image, replacing one color
+   Forwards to dsubimage_p8_swapcolor() with a box starting at (0, 0) and
+   spanning img->width by img->height. */
+void dimage_p8_swapcolor(int x, int y, image_t const *img, int eff,
+	int old_color, int new_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p8_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
+		old_color, new_color);
+}
+
+/* dsubimage_p8_swapcolor(): Render a section of a P8 image, replacing one
+   color
+   old_index goes to cmd.color_1 and new_color to cmd.color_2; cmd.effect is
+   raised by 8 and the command runs through gint_image_p8_loop() with
+   gint_image_p8_swapcolor as its loop. Nothing is drawn when
+   gint_image_mkcmd() returns false. */
+void dsubimage_p8_swapcolor(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int old_index, int new_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p8_swapcolor;
+		command.color_2 = new_color;
+		command.color_1 = old_index;
+		command.effect += 8;
+		gint_image_p8_loop(DWIDTH, &command);
+	}
+}
+
+/* dimage_p8_addbg(): Render a whole P8 image over a background color
+   Forwards to dsubimage_p8_addbg() with a box starting at (0, 0) and spanning
+   img->width by img->height. */
+void dimage_p8_addbg(int x, int y, image_t const *img, int eff,
+	int bg_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_p8_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
+}
+
+/* dsubimage_p8_addbg(): Render a section of a P8 image over a background
+   color
+   Reuses the swapcolor loop: cmd.color_1 is img->alpha and cmd.color_2 is
+   bg_color, with cmd.effect raised by 8. Nothing is drawn when
+   gint_image_mkcmd() returns false. */
+void dsubimage_p8_addbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_p8_swapcolor;
+		command.color_2 = bg_color;
+		command.color_1 = img->alpha;
+		command.effect += 8;
+		gint_image_p8_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_rgb16.S
+++ b/src/render-cg/image/image_rgb16.S
@ -0,0 +1,69 @@
+.global _gint_image_rgb16_loop
+
+/* gint's image renderer: 16-bit RGB entry point
+
+   These formats are the simplest of the bunch. RGB565 can use longword access
+   in cases when alignment is favorable and no geometric effect is applied. In
+   other cases, pixels are handled individually; geometric effects affect the
+   input/output logic while color effects change the computations themselves.
+
+   r0: - (initially: cmd.effect)
+   r1: Number of lines remaining to draw
+   r2: Number of columns per line
+   r3: Input pointer
+   r4: Input stride
+   r5: Output pointer
+   r6: Output stride
+   r7: Right edge (only used in Azur) or [temporary]
+   r8: - (initially: cmd)
+   r9: - (initially: cmd.loop) */
+
+/* Common RGB16 entry: unpacks the command at r5 into registers, then jumps to
+   the loop function read from the command. On the jump:
+     r0 = cmd.effect >> 1, r1 = cmd.lines (zero-extended), r2 = cmd.columns,
+     r3 = cmd.input, r5 = cmd.output, r6 = 2 * (output_width - columns),
+     r4 = 2 * (input_stride - columns) (input_stride negated first when bit 0
+     of cmd.effect is set), r8 = address of the command field following
+     cmd.palette, r9 = the loop function.
+   The caller's r8 and r9 are pushed on the stack and are not popped here. */
+_gint_image_rgb16_loop:
+	/* r4: int output_width (pixels)
+	   r5: struct gint_image_cmd *cmd */
+
+	mov.b	@(1,r5), r0	/* cmd.effect */
+	add	#2, r5
+
+	mov.l	r8, @-r15
+	mov	r4, r6
+
+	mov.w	@r5+, r2	/* cmd.columns */
+	mov	r5, r8
+
+	/* From here on the command is r8 */
+
+	mov.l	r9, @-r15
+	shlr	r0		/* T bit is now VFLIP */
+
+	mov.w	@r8+, r4	/* cmd.input_stride */
+	sub	r2, r6
+
+	mov.b	@r8+, r1	/* cmd.lines */
+	add	r6, r6
+
+	mov.b	@r8+, r9	/* cmd.edge_1 (don't care) */
+	nop
+
+	/* Loop function to jump to; overwrites the edge_1 byte just read */
+	mov.l	@r8+, r9
+	extu.b	r1, r1
+
+	mov.l	@r8+, r5	/* cmd.output */
+	nop
+
+	mov.l	@r8+, r3	/* cmd.input */
+	nop
+
+	/* The add below sits in the delay slot, so it runs on both paths */
+	bf.s	_NO_VFLIP
+	add	#4, r8		/* cmd.palette (don't care) */
+
+_VFLIP:
+	neg	r4, r4
+	nop
+
+_NO_VFLIP:
+	sub	r2, r4
+	nop
+
+	/* Delay slot: double r4 to turn the pixel count into bytes */
+	jmp	@r9
+	add	r4, r4
--- a/src/render-cg/image/image_rgb16.c
+++ b/src/render-cg/image/image_rgb16.c
@ -0,0 +1,43 @@
+#include <gint/image.h>
+#include <gint/display.h>
+
+/* dimage_rgb16(): Render a whole RGB16 image
+   Forwards to dsubimage_rgb16() with a box starting at (0, 0) and spanning
+   img->width by img->height. */
+void dimage_rgb16(int x, int y, image_t const *img, int eff)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_rgb16(x, y, img, 0, 0, full_w, full_h, eff);
+}
+
+/* dsubimage_rgb16(): Render a section of an RGB16 image
+   Images whose profile is IMAGE_RGB565A are delegated to
+   dsubimage_rgb16_clearbg() with img->alpha as the background value. Every
+   other profile is drawn with the gint_image_rgb16_normal loop. Nothing is
+   drawn when gint_image_mkcmd() returns false. */
+void dsubimage_rgb16(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff)
+{
+	if(img->profile == IMAGE_RGB565A) {
+		/* Call, then return: ISO C forbids "return expr;" in a
+		   function returning void, even when expr has type void */
+		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
+			img->alpha);
+		return;
+	}
+
+	struct gint_image_box box = { x, y, w, h, left, top };
+	struct gint_image_cmd cmd;
+
+	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
+		DHEIGHT)) return;
+	cmd.loop = gint_image_rgb16_normal;
+	gint_image_rgb16_loop(DWIDTH, &cmd);
+}
+
+/* dimage_rgb16_clearbg(): Render a whole RGB16 image, skipping one color
+   Forwards to dsubimage_rgb16_clearbg() with a box starting at (0, 0) and
+   spanning img->width by img->height. */
+void dimage_rgb16_clearbg(int x, int y, image_t const *img, int eff,int bg)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_rgb16_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
+}
+
+/* dsubimage_rgb16_clearbg(): Render a section of an RGB16 image, skipping one
+   color
+   bg_color goes to cmd.color_1; cmd.effect is raised by 4 and the command
+   runs through gint_image_rgb16_loop() with gint_image_rgb16_clearbg as its
+   loop. Nothing is drawn when gint_image_mkcmd() returns false. */
+void dsubimage_rgb16_clearbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_rgb16_clearbg;
+		command.color_1 = bg_color;
+		command.effect += 4;
+		gint_image_rgb16_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_rgb16_clearbg_dye.S
+++ b/src/render-cg/image/image_rgb16_clearbg_dye.S
@ -0,0 +1,53 @@
+.global _gint_image_rgb16_clearbg
+.global _gint_image_rgb16_dye
+#include "image_macros.S"
+
+/* RGB16 CLEARBG and DYE, RAM version: trivial.
+
+   This function handles both CLEARBG and DYE; in RGB16 they are the same,
+   except that DYE writes not the pixel value (TMP) but a fixed color (SRC). As
+   is often the case, the RAM speed is limiting, so there is no point in
+   improving speed of the code on the CPU side. */
+
+/* GEN_CLEARBG_DYE_LOOP: expand one copy of the RGB16 CLEARBG/DYE loop.
+
+   HFLIP:   non-zero to assemble the extra r5/r6 adjustment
+   OUT_DIR: twice the byte step added to r5 after each pixel (the loop adds
+            OUT_DIR/2)
+   TMP:     register receiving each input pixel
+   SRC:     register stored to the output when the pixel differs from r9
+
+   With TMP = SRC = r0 the input pixel itself is written; with TMP = r7 and
+   SRC = r0 the value loaded from cmd.color_2 is written instead. END and
+   EPILOGUE are defined in image_macros.S, which is not visible here. */
+.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP, SRC
+	mov.w	@r8+, r0	/* cmd.edge_2 (don't care) */
+	mov.w	@r8+, r9	/* cmd.color_1 (alpha color) */
+	mov.w	@r8+, r0	/* cmd.color_2 (dye color) */
+
+	/* With HFLIP: r5 += 2 * r2 - 2 and r6 += 4 * r2 */
+ .if \HFLIP
+	add	#-2, r5
+	mov	r2, r8
+	shll	r8
+	add	r8, r5
+	shll	r8
+	add	r8, r6
+ .endif
+
+	/* Per line: r8 counts down from r2 */
+1:	mov	r2, r8
+
+	/* Skip the store when the pixel equals r9 */
+2:	mov.w	@r3+, \TMP
+	cmp/eq	\TMP, r9
+	bt	3f
+	mov.w	\SRC, @r5
+
+3:	dt	r8
+	bf.s	2b
+	add	#(\OUT_DIR/2), r5
+
+	END
+	EPILOGUE
+.endm
+
+/* Entry point stored in cmd.loop by dsubimage_rgb16_clearbg(). Bit 0 of r0
+   (cmd.effect >> 1 when coming from _gint_image_rgb16_loop) picks the HFLIP=0
+   or HFLIP=1 (label 9) expansion. TMP and SRC are both r0, so the pixel that
+   was just read is the value written. The first expansion is expected to
+   leave through its EPILOGUE (defined in image_macros.S, not visible here -
+   TODO confirm). */
+_gint_image_rgb16_clearbg:
+	tst	#1, r0
+	bf	9f
+
+	GEN_CLEARBG_DYE_LOOP 0, 4, r0, r0
+9:	GEN_CLEARBG_DYE_LOOP 1, -4, r0, r0
+
+/* Entry point stored in cmd.loop by dsubimage_rgb16_dye(). Bit 0 of r0
+   (cmd.effect >> 1 when coming from _gint_image_rgb16_loop) picks the HFLIP=0
+   or HFLIP=1 (label 9) expansion. Pixels are read into r7 while r0 keeps
+   cmd.color_2, so every pixel that is not skipped is written as that fixed
+   color. The first expansion is expected to leave through its EPILOGUE
+   (defined in image_macros.S, not visible here - TODO confirm). */
+_gint_image_rgb16_dye:
+	tst	#1, r0
+	bf	9f
+
+	GEN_CLEARBG_DYE_LOOP 0, 4, r7, r0
+9:	GEN_CLEARBG_DYE_LOOP 1, -4, r7, r0
--- a/src/render-cg/image/image_rgb16_dye.c
+++ b/src/render-cg/image/image_rgb16_dye.c
@ -0,0 +1,23 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_rgb16_dye(): Render a whole RGB16 image with the dye effect
+   Forwards to dsubimage_rgb16_dye() with a box starting at (0, 0) and
+   spanning img->width by img->height. */
+void dimage_rgb16_dye(int x, int y, image_t const *img, int eff, int dye_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_rgb16_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
+}
+
+/* dsubimage_rgb16_dye(): Render a section of an RGB16 image with the dye
+   effect
+   cmd.color_1 receives img->alpha and cmd.color_2 the dye color; cmd.effect
+   is raised by 12 and the command runs through gint_image_rgb16_loop() with
+   gint_image_rgb16_dye as its loop. Nothing is drawn when gint_image_mkcmd()
+   returns false. */
+void dsubimage_rgb16_dye(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int dye_color)
+{
+	struct gint_image_cmd command;
+	struct gint_image_box region = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
+		DHEIGHT))
+	{
+		command.loop = gint_image_rgb16_dye;
+		command.color_2 = dye_color;
+		command.color_1 = img->alpha;
+		command.effect += 12;
+		gint_image_rgb16_loop(DWIDTH, &command);
+	}
+}
--- a/src/render-cg/image/image_rgb16_effect.c
+++ b/src/render-cg/image/image_rgb16_effect.c
@ -0,0 +1,32 @@
+#include <gint/image.h>
+
+/* dsubimage_rgb16_effect(): Variadic front-end for the RGB16 dynamic effects
+   Tests eff against IMAGE_CLEARBG, IMAGE_SWAPCOLOR, IMAGE_ADDBG and IMAGE_DYE
+   in that order and calls the first matching renderer; with none of them set
+   it calls dsubimage_rgb16(). The extra arguments are read as int: one for
+   CLEARBG, ADDBG and DYE, two (source color, then replacement color) for
+   SWAPCOLOR. */
+void dsubimage_rgb16_effect(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, ...)
+{
+	va_list ap;
+	va_start(ap, eff);
+
+	if(eff & IMAGE_CLEARBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
+			bg_color);
+	}
+	else if(eff & IMAGE_SWAPCOLOR) {
+		/* Two separate statements: the order of the reads matters */
+		int const old_color = va_arg(ap, int);
+		int const new_color = va_arg(ap, int);
+		dsubimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
+			old_color, new_color);
+	}
+	else if(eff & IMAGE_ADDBG) {
+		int const bg_color = va_arg(ap, int);
+		dsubimage_rgb16_addbg(x, y, img, left, top, w, h, eff,
+			bg_color);
+	}
+	else if(eff & IMAGE_DYE) {
+		int const dye_color = va_arg(ap, int);
+		dsubimage_rgb16_dye(x, y, img, left, top, w, h, eff,
+			dye_color);
+	}
+	else {
+		dsubimage_rgb16(x, y, img, left, top, w, h, eff);
+	}
+
+	va_end(ap);
+}
--- a/src/render-cg/image/image_rgb16_normal.S
+++ b/src/render-cg/image/image_rgb16_normal.S
@ -0,0 +1,201 @@
+.global _gint_image_rgb16_normal
+#include "image_macros.S"
+
+/* RGB16 Opaque rendering, RAM version: by longword access.
+
+   This function of the image renderer is designed for the RAM model only. At
+   default overclock levels, the RAM can register a write every 13-14 cycles,
+   regardless of size. Since this amount of time is more than enough to build a
+   target longword regardless of alignment and geometry considerations, the
+   main and only focus of this function is to only write longwords.
+
+   Since longwords can only be written at 4-aligned addresses and always make
+   pairs of pixels, there are variations on the loop depending on the rendered
+   width and destination. These are marked with the following convention:
+
+   * w1 / w2 denotes the parity of the command width;
+   * o2 / o4 denotes the alignment of the output.
+
+   There is a forward and a backward variation for all four combinations of
+   these parameters, noted F_ and B_ in label names. Some word-based variations
+   are provided for width ≤ 8, which is just a way to ensure that the longword-
+   based loops always have at least one iteration, since they're implemented as
+   do/while.
+
+   The loops themselves are nowhere near tight on the CPU side and entirely
+   bottlenecked by the RAM, hence the simplicity and complete disregard for
+   superscalar parallelism. */
+
+_gint_image_rgb16_normal:
+	/* We use word copy for width ≤ 8; this is to ensure that there is at
+	   least one longword in the non-trivial loop, simplifying checks */
+	tst	#1, r0		/* T = 1 iff bit 0 of r0 is clear */
+	mov	#8, r0		/* Width threshold; mov leaves T untouched */
+	/* bf.s branches on the T set by tst; its delay slot then sets T anew */
+	bf.s	.BACKWARD	/* Bit 0 set: backward rendering */
+	cmp/ge	r2, r0		/* Delay slot: T = (8 >= r2), ie. width <= 8 */
+
+.FORWARD:
+	bt	_FORWARD_WORD_COPY	/* width <= 8: word-by-word loop */
+	nop
+
+	bra	_FORWARD_LONG_COPY	/* width > 8: longword loops */
+	nop
+
+.BACKWARD:
+	mov	r2, r0		/* r0 = width; T still holds width <= 8 */
+	add	r0, r0		/* r0 = 2 * width, the bytes in one row */
+	add	r0, r5		/* Move output pointer past the row's end */
+	add	r0, r0		/* r0 = 4 * width */
+	/* NOTE(review): r6 is presumably the row-to-row output step -- confirm */
+	bt.s	_BACKWARD_WORD_COPY	/* width <= 8: word-by-word loop */
+	add	r0, r6		/* Delay slot: r6 += 4 * width */
+
+	bra	_BACKWARD_LONG_COPY	/* width > 8: longword loops */
+	nop
+
+_FORWARD_WORD_COPY:		/* Forward rows of width <= 8, pixel by pixel */
+	START
+2:	movs.w	@r3+, x0	/* Load one 16-bit pixel, advance the input */
+3:	movs.w	x0, @r5+	/* Store it, advance the output by 2 bytes */
+	END
+	EPILOGUE
+
+_BACKWARD_WORD_COPY:		/* Backward rows of width <= 8, pixel by pixel */
+	START
+2:	movs.w	@r3+, x0	/* Load one 16-bit pixel, advance the input */
+3:	movs.w	x0, @-r5	/* Step the output back 2 bytes, then store */
+	END
+	EPILOGUE
+
+_FORWARD_LONG_COPY:		/* Forward rows of width > 8, by longwords */
+	shlr	r2		/* Test width parity */
+	mov	#2, r0		/* Mask for bit 1 of the output address */
+	/* r2 is now width / 2 (rounded down): the pixel pairs in one row */
+	bt	.F_w1		/* T = old bit 0 of width: odd widths */
+	nop
+
+.F_w2:	tst	r0, r5		/* Test alignment of output */
+	bf	.F_w2o2		/* Bit 1 set: output is not 4-aligned */
+	/* NOTE(review): code outside 2:..3: presumably runs once per row */
+.F_w2o4:			/* Even width, 4-aligned: longwords only */
+	START
+2:	mov.w	@r3+, r0	/* First pixel of the pair */
+	mov.w	@r3+, r7	/* Second pixel of the pair */
+	shll16	r7		/* Second pixel into the upper half of r7 */
+	xtrct	r0, r7		/* r7 = (first << 16) | second */
+	mov.l	r7, @r5		/* Write both pixels in one access */
+3:	add	#4, r5
+	END
+	EPILOGUE
+
+.F_w2o2:			/* Even width, output not 4-aligned */
+	add	#-1, r2		/* Two pixels go out as words: one pair less */
+	START
+	mov.w	@r3+, r0	/* Leading pixel, copied as a word... */
+	mov.w	r0, @r5
+	add	#2, r5		/* ...after which the output is 4-aligned */
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r7
+	xtrct	r0, r7		/* r7 = (first << 16) | second */
+	mov.l	r7, @r5
+3:	add	#4, r5
+	mov.w	@r3+, r0	/* Trailing pixel, copied as a word */
+	mov.w	r0, @r5
+	add	#2, r5
+	END
+	EPILOGUE
+
+.F_w1:	tst	r0, r5		/* Test alignment of output */
+	bf	.F_w1o2		/* Bit 1 set: output is not 4-aligned */
+
+.F_w1o4:			/* Odd width, 4-aligned: pairs, then one word */
+	START
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r7
+	xtrct	r0, r7		/* r7 = (first << 16) | second */
+	mov.l	r7, @r5
+3:	add	#4, r5
+	mov.w	@r3+, r0	/* Trailing odd pixel, copied as a word */
+	mov.w	r0, @r5
+	add	#2, r5
+	END
+	EPILOGUE
+
+.F_w1o2:			/* Odd width, not 4-aligned: one word, then pairs */
+	START
+	mov.w	@r3+, r0	/* Leading odd pixel, copied as a word... */
+	mov.w	r0, @r5
+	add	#2, r5		/* ...after which the output is 4-aligned */
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r7
+	xtrct	r0, r7		/* r7 = (first << 16) | second */
+	mov.l	r7, @r5
+3:	add	#4, r5
+	END
+	EPILOGUE
+
+_BACKWARD_LONG_COPY:		/* Backward rows of width > 8, by longwords */
+	shlr	r2		/* Test width parity */
+	mov	#2, r0		/* Mask for bit 1 of the output address */
+	/* r2 is now width / 2 (rounded down): the pixel pairs in one row */
+	bt	.B_w1		/* T = old bit 0 of width: odd widths */
+	nop
+
+.B_w2:	tst	r0, r5		/* Test alignment of output */
+	bf	.B_w2o2		/* Bit 1 set: end of row is not 4-aligned */
+	/* r5 was moved past the end of the row by .BACKWARD; stores use @-r5 */
+.B_w2o4:			/* Even width, 4-aligned: longwords only */
+	START
+2:	mov.w	@r3+, r0	/* First pixel of the pair */
+	mov.w	@r3+, r7	/* Second pixel of the pair */
+	shll16	r0		/* First pixel into the upper half of r0 */
+	xtrct	r7, r0		/* r0 = (second << 16) | first: pair mirrored */
+3:	mov.l	r0, @-r5	/* Step back 4 bytes, write both pixels */
+	END
+	EPILOGUE
+
+.B_w2o2:			/* Even width, end of row not 4-aligned */
+	add	#-1, r2		/* Two pixels go out as words: one pair less */
+	START
+	mov.w	@r3+, r0	/* Leading pixel as a word; r5 then 4-aligned */
+	mov.w	r0, @-r5
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r0
+	xtrct	r7, r0		/* r0 = (second << 16) | first */
+3:	mov.l	r0, @-r5
+	mov.w	@r3+, r0	/* Trailing pixel, copied as a word */
+	mov.w	r0, @-r5
+	END
+	EPILOGUE
+
+.B_w1:	tst	r0, r5		/* Test alignment of output */
+	bf	.B_w1o2		/* Bit 1 set: end of row is not 4-aligned */
+
+.B_w1o4:			/* Odd width, 4-aligned: pairs, then one word */
+	START
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r0
+	xtrct	r7, r0		/* r0 = (second << 16) | first */
+3:	mov.l	r0, @-r5
+	mov.w	@r3+, r0	/* Trailing odd pixel, copied as a word */
+	mov.w	r0, @-r5
+	END
+	EPILOGUE
+
+.B_w1o2:			/* Odd width, not 4-aligned: one word, then pairs */
+	START
+	mov.w	@r3+, r0	/* Leading odd pixel as a word; r5 then 4-aligned */
+	mov.w	r0, @-r5
+2:	mov.w	@r3+, r0
+	mov.w	@r3+, r7
+	shll16	r0
+	xtrct	r7, r0		/* r0 = (second << 16) | first */
+3:	mov.l	r0, @-r5
+	END
+	EPILOGUE
--- a/src/render-cg/image/image_rgb16_swapcolor.S
+++ b/src/render-cg/image/image_rgb16_swapcolor.S
@ -0,0 +1,45 @@
+.global _gint_image_rgb16_swapcolor
+#include "image_macros.S"
+
+/* RGB16 SWAPCOLOR, RAM version: trivial.
+
+   This function is once again bottlenecked by RAM. Generating longwords would
+   be tight and require significant adjustments, so we stick to words, and the
+   trivial bopti-style version already maxes out the output rate. */
+
+.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
+	mov.w	@r8+, r0	/* cmd.edge_2 (don't care) */
+	mov.w	@r8+, r9	/* cmd.color_1 */
+	mov.w	@r8+, r7	/* cmd.color_2 */
+	/* With HFLIP set, start each row from its last output pixel */
+ .if \HFLIP
+	add	#-2, r5		/* Stores below do not pre-decrement */
+	mov	r2, r0		/* r0 = width */
+	shll	r0		/* r0 = 2 * width */
+	add	r0, r5		/* r5 has moved by 2 * width - 2 in total */
+	shll	r0		/* r0 = 4 * width */
+	add	r0, r6		/* r6 += 4 * width */
+ .endif
+	/* Copy loop: pixels equal to color_1 are written out as color_2 */
+1:	mov	r2, r8		/* r8 = pixels left in the row (reuses r8) */
+
+2:	mov.w	@r3+, r0	/* Load the next source pixel */
+	cmp/eq	r0, r9		/* T = 1 iff it equals color_1 */
+	bf	4f		/* Different: store it unchanged */
+	mov	r7, r0		/* Equal: substitute color_2 */
+4:	mov.w	r0, @r5
+
+3:	dt	r8		/* Count the pixel; T = 1 once the row is done */
+	bf.s	2b
+	add	#\OUT_DIR, r5	/* Delay slot: step output by OUT_DIR bytes */
+
+	END
+	EPILOGUE
+.endm
+
+_gint_image_rgb16_swapcolor:	/* Entry: pick a loop variant from bit 0 of r0 */
+	tst	#1, r0		/* T = 1 iff bit 0 of r0 is clear */
+	bf	9f		/* Bit 0 set: HFLIP expansion at 9: */
+	/* First expansion: HFLIP=0, output +2 per pixel; second: HFLIP=1, -2 */
+	GEN_SWAPCOLOR_LOOP 0, 2
+9:	GEN_SWAPCOLOR_LOOP 1, -2
--- a/src/render-cg/image/image_rgb16_swapcolor.c
+++ b/src/render-cg/image/image_rgb16_swapcolor.c
@ -0,0 +1,46 @@
+#include <gint/display.h>
+#include <gint/image.h>
+
+/* dimage_rgb16_swapcolor(): Draw a whole RGB16 image with one color replaced
+   Forwards to dsubimage_rgb16_swapcolor() with a source rectangle that
+   starts at (0, 0) and spans img->width by img->height. */
+void dimage_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
+	int old_color, int new_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_rgb16_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
+		old_color, new_color);
+}
+
+/* dsubimage_rgb16_swapcolor(): Draw part of an RGB16 image, one color replaced
+
+   Describes the request (destination x/y, size w/h, source left/top) in a
+   gint_image_box, asks gint_image_mkcmd() to turn it into a command, then
+   runs that command through gint_image_rgb16_loop() with
+   gint_image_rgb16_swapcolor as the loop function. old_color and new_color
+   are passed as the command's first and second colors. Nothing is drawn when
+   gint_image_mkcmd() returns a false value. */
+void dsubimage_rgb16_swapcolor(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int old_color, int new_color)
+{
+	struct gint_image_cmd cmd;
+	struct gint_image_box box = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
+		DHEIGHT)) {
+		/* NOTE(review): the meaning of the +8 offset is not visible
+		   here; presumably it identifies the swapcolor loops --
+		   confirm */
+		cmd.effect += 8;
+		cmd.loop = gint_image_rgb16_swapcolor;
+		cmd.color_1 = old_color;
+		cmd.color_2 = new_color;
+		gint_image_rgb16_loop(DWIDTH, &cmd);
+	}
+}
+
+/* dimage_rgb16_addbg(): Draw a whole RGB16 image over a background color
+   Forwards to dsubimage_rgb16_addbg() with a source rectangle that starts at
+   (0, 0) and spans img->width by img->height. */
+void dimage_rgb16_addbg(int x, int y, image_t const *img, int eff,
+	int bg_color)
+{
+	int const full_w = img->width;
+	int const full_h = img->height;
+
+	dsubimage_rgb16_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
+}
+
+/* dsubimage_rgb16_addbg(): Draw part of an RGB16 image over a background color
+
+   Same command setup as a color swap: the loop function is
+   gint_image_rgb16_swapcolor, with the image's alpha value as the first
+   color and bg_color as the second. The request (destination x/y, size w/h,
+   source left/top) goes through gint_image_mkcmd(), and nothing is drawn
+   when it returns a false value. */
+void dsubimage_rgb16_addbg(int x, int y, image_t const *img,
+	int left, int top, int w, int h, int eff, int bg_color)
+{
+	struct gint_image_cmd cmd;
+	struct gint_image_box box = { x, y, w, h, left, top };
+
+	if(gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
+		DHEIGHT)) {
+		/* NOTE(review): the meaning of the +8 offset is not visible
+		   here; presumably it identifies the swapcolor loops --
+		   confirm */
+		cmd.effect += 8;
+		cmd.loop = gint_image_rgb16_swapcolor;
+		cmd.color_1 = img->alpha;
+		cmd.color_2 = bg_color;
+		gint_image_rgb16_loop(DWIDTH, &cmd);
+	}
+}