azur: image shader with dynamic effects, and 16-row fragment

This commit is contained in:
Lephe 2022-05-07 18:17:33 +01:00 committed by Lephenixnoir
parent e124719de3
commit 8ac9ac747a
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
27 changed files with 1599 additions and 877 deletions

View File

@ -28,10 +28,32 @@ endif()
if(AZUR_GRAPHICS_GINT_CG)
list(APPEND SOURCES
src/gint/render.c
src/gint/r61524.s
# Clear shader
src/gint/shaders/clear.c
src/gint/shaders/clear.S
# Image shader
src/gint/shaders/image.c
src/gint/shaders/image.S)
src/gint/shaders/image_rgb16_normal.S
src/gint/shaders/image_rgb16_clearbg.S
src/gint/shaders/image_rgb16_swapcolor.S
src/gint/shaders/image_rgb16_dye.S
src/gint/shaders/image_p8_normal.S
src/gint/shaders/image_p8_swapcolor.S
src/gint/shaders/image_p4_normal.S
# Image shader interface
src/gint/shaders/image_rgb16.c
src/gint/shaders/image_rgb16_effect.c
src/gint/shaders/image_rgb16_swapcolor.c
src/gint/shaders/image_rgb16_dye.c
src/gint/shaders/image_p8.c
src/gint/shaders/image_p8_effect.c
src/gint/shaders/image_p8_swapcolor.c
src/gint/shaders/image_p8_dye.c
src/gint/shaders/image_p4.c
src/gint/shaders/image_p4_effect.c
src/gint/shaders/image_p4_swapcolor.c
src/gint/shaders/image_p4_dye.c)
endif()
add_library(azur STATIC ${SOURCES})

View File

@ -1,5 +1,5 @@
//---
// azur.defs: Generation definitions
// azur.defs: General definitions that are included in every file
//---
/* This exposes compile-time configuration symbols. I don't like running the

View File

@ -33,8 +33,8 @@
#include <azur/defs.h>
AZUR_BEGIN_DECLS
#include <gint/defs/types.h>
#include <gint/display.h>
#include <gint/image.h>
#include <libprof.h>
@ -45,7 +45,7 @@ AZUR_BEGIN_DECLS
typedef void azrp_shader_t(void *uniforms, void *command, void *fragment);
/* Video memory fragment used as rendering target (in XRAM). */
extern uint16_t azrp_frag[];
extern uint16_t *azrp_frag;
/* Maximum number of commands that can be queued. (This is only one of two
limits, the other being the size of the command data.) */
@ -128,19 +128,19 @@ extern int azrp_frag_height;
The settings in each mode are as follows:
* x1: Display resolution: 396x224
Fragment size: 8 rows (6336 bytes)
Fragment size: 16 rows (12672 bytes)
Number of fragments: 14 (15 if an offset is used)
Total size of graphics data: 177.408 kB
Total size of graphics data: 177'408 bytes
* x2: Display resolution: 198x112
Fragment size: 16 rows (6336 bytes)
Fragment size: 16 rows (6336 bytes) # TODO: increase
Number of fragments: 7 (8 if an offset is used)
Total size of graphics data: 44.352 kB
Total size of graphics data: 44'352 bytes
* x3: Display resolution: 132x75 (last row only has 2/3 pixels)
Fragment size: 16 rows (4224 bytes)
Fragment size: 16 rows (4224 bytes) # TODO: increase
Number of fragments: 5 (sometimes 6 if an offset is used)
Total size of graphics data: 19.800 kB
Total size of graphics data: 19'800 bytes
As one would know when playing modern video games, super-resolution is one
of the most useful ways to increase performance. The reduced amount of
@ -167,30 +167,50 @@ void azrp_config_scale(int scale);
@offset Fragment offset along the y-axis (0 ... height of fragment-1). */
void azrp_config_frag_offset(int offset);
//---
// Hooks
//---
/* Hook called before a fragment is sent to the display. The fragment can be
accessed and modified freely (however, the time spent in the hook is
counted as overhead and is only part of [azrp_perf_render]). */
typedef void azrp_hook_prefrag_t(int id, void *fragment, int size);
/* Get or set the prefrag hook. */
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void);
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *);
//---
// Standard shaders
//---
/* Clears the entire output with a single color */
/* Clears the entire output with a single color */
extern uint8_t AZRP_SHADER_CLEAR;
/* Renders RGB565 textures/images */
extern uint8_t AZRP_SHADER_IMAGE;
/* Renders gint images with various dynamic effects */
extern uint8_t AZRP_SHADER_IMAGE_RGB16;
extern uint8_t AZRP_SHADER_IMAGE_P8;
extern uint8_t AZRP_SHADER_IMAGE_P4;
/* azrp_clear(): Clear output [AZRP_SHADER_CLEAR] */
void azrp_clear(uint16_t color);
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE] */
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE_*] */
void azrp_image(int x, int y, bopti_image_t const *image);
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE] */
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE_*] */
void azrp_subimage(int x, int y, bopti_image_t const *image,
int left, int top, int width, int height, int flags);
/* See below for more detailed image functions. Dynamic effects are provided
with the same naming convention as gint. */
/* Functions to update uniforms for these shaders. You should call them when:
* AZRP_SHADER_CLEAR: Changing super-scaling settings.
* AZRP_SHADER_IMAGE: Changing super-scaling or or fragment offsets. */
* AZRP_SHADER_IMAGE_*: Changing super-scaling or fragment offsets. */
void azrp_shader_clear_configure(void);
void azrp_shader_image_configure(void);
void azrp_shader_image_rgb16_configure(void);
void azrp_shader_image_p8_configure(void);
void azrp_shader_image_p4_configure(void);
//---
// Performance indicators
@ -250,32 +270,79 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
exceeded. */
bool azrp_queue_command(void *command, size_t size, int fragment, int count);
/* azrp_queue_image(): Split and queue a gint image command
The command must have been completely prepared with gint_image_mkcmd() and
have had its color effect sections filled. This function sets the shader ID
and adjusts the command for fragmented rendering. */
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
struct gint_image_cmd *cmd);
//---
// Internal shader definitions (for reference; no API guarantee)
// Internal R61524 functions
//---
struct azrp_shader_image_command {
uint8_t shader_id;
/* First edge-preserved pixel offset (P4 only) */
int8_t edge1;
/* Pixels per line */
int16_t columns;
/* Address of the image structure */
bopti_image_t const *image;
/* Destination in XRAM (offset) */
uint16_t output;
/* Number of lines */
int16_t lines;
/* Already offset by start row and column */
void const *input;
void azrp_r61524_fragment_x1(void *fragment, int size);
/* Info for structure update between fragments: */
int16_t height;
int16_t row_stride;
int16_t x;
void azrp_r61524_fragment_x2(void *fragment, int width, int height);
/* Second edge-preserved pixel offset (P4 only) */
int16_t edge2;
};
//---
// Internal functions for the image shader
//
// We use gint's image rendering API but replace some of the core loops with
// Azur-specific versions that are faster in the CPU-bound context of this
// rendering engine. Some of the main loops from Azur actually perform better
// in RAM than bopti used to do, and are already in gint.
//---
/* azrp_image_effect(): Generalized azrp_image() with dynamic effects */
#define azrp_image_effect(x, y, img, eff, ...) \
azrp_image_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
##__VA_ARGS__)
/* azrp_subimage_effect(): Generalized azrp_subimage() with dynamic effects */
void azrp_subimage_effect(int x, int y, image_t const *img,
int left, int top, int w, int h, int effects, ...);
/* Specific versions for each format */
#define AZRP_IMAGE_SIG1(NAME, ...) \
void azrp_image_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
void azrp_subimage_ ## NAME(int x, int y, image_t const *img, \
int left, int top, int w, int h, ##__VA_ARGS__);
#define AZRP_IMAGE_SIG(NAME, ...) \
AZRP_IMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
AZRP_IMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
AZRP_IMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
AZRP_IMAGE_SIG(_effect, int effects, ...)
AZRP_IMAGE_SIG(, int effects)
AZRP_IMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
AZRP_IMAGE_SIG(_swapcolor, int effects, int source, int replacement)
AZRP_IMAGE_SIG(_addbg, int effects, int bg_color)
AZRP_IMAGE_SIG(_dye, int effects, int dye_color)
#define azrp_image_rgb16_effect(x, y, img, eff, ...) \
azrp_subimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define azrp_image_p8_effect(x, y, img, eff, ...) \
azrp_subimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define azrp_image_p4_effect(x, y, img, eff, ...) \
azrp_subimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#undef AZRP_IMAGE_SIG
#undef AZRP_IMAGE_SIG1
/* Main loop provided by Azur; as usual, these are not real functions; their
only use is as the [.loop] field of a command. */
void azrp_image_shader_rgb16_normal(void);
void azrp_image_shader_rgb16_clearbg(void);
void azrp_image_shader_rgb16_swapcolor(void);
void azrp_image_shader_rgb16_dye(void);
void azrp_image_shader_p8_normal(void);
void azrp_image_shader_p8_swapcolor(void);
void azrp_image_shader_p4_normal(void);
void azrp_image_shader_p4_clearbg(void);
AZUR_END_DECLS

65
azur/src/gint/r61524.s Normal file
View File

@ -0,0 +1,65 @@
/* azrp_r61524_fragment_x1(): Push an x1-scale fragment to the display.
   r4: fragment data (XRAM), r5: fragment size in pixels (16-bit words).
   Streams the fragment to the R61524 data port with a DSP repeat loop
   (ldrs/ldre/ldrc), moving longwords (2 pixels per transfer).
   NOTE(review): r5 is halved into a longword count, so this assumes an
   even pixel count — confirm with callers (396 * rows is always even). */
.section .ilram, "ax"
.balign 4
.global _azrp_r61524_fragment_x1
_azrp_r61524_fragment_x1:
mov.l .R61524_DATA, r2
/* Halve the pixel count: each iteration copies one longword (2 pixels) */
shlr r5
ldrs 1f
ldre 2f
ldrc r5
nop
/* Read a word from XRAM */
1: mov.l @r4+, r0
/* Write that word to the display */
2: mov.l r0, @r2
rts
nop
/* azrp_r61524_fragment_x2(): Push an x2-upscaled fragment to the display.
   r4: fragment data, r5: fragment width in pixels, r6: height in rows.
   Each source pixel is written twice (horizontal doubling) and each source
   line is replayed once (vertical doubling), yielding the x2 upscale. */
.balign 4
.global _azrp_r61524_fragment_x2
_azrp_r61524_fragment_x2:
mov.l .R61524_DATA, r2
nop
/* Read a word, write it twice */
ldrs 1f
ldre 2f
ldrc r5
nop
1: mov.w @r4+, r0
nop
mov.w r0, @r2
nop
mov.w r0, @r2
2: nop
/* Rewind r4 by one source line (r5 words = 2*r5 bytes) to replay it */
sub r5, r4
sub r5, r4
/* Do that again on a second line */
ldrs 3f
ldre 4f
ldrc r5
nop
3: mov.w @r4+, r0
nop
mov.w r0, @r2
nop
mov.w r0, @r2
4: nop
/* One source row done (emitted twice); loop until all rows are sent */
dt r6
bf _azrp_r61524_fragment_x2
rts
nop
.balign 4
/* Address of the R61524 display driver's data write area */
.R61524_DATA:
.long 0xb4000000

View File

@ -7,11 +7,8 @@
#include <string.h>
#include <stdlib.h>
#define YRAM ((void *)0xe5017000)
/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM.
TODO: Extend this to 16 rows, and move the rest to RAM */
GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
/* 16 rows of video memory, occupying 12736/16384 bytes of XYRAM (77.7%). */
uint16_t *azrp_frag = (void *)0xe500e000 + 32;
/* Super-scaling factor, width and height of output. */
int azrp_scale;
@ -22,27 +19,33 @@ int azrp_frag_count;
/* Height of fragment. */
int azrp_frag_height;
/* TODO: Either make command queue private or use azrp_ prefix */
/* Number and total size of queued commands. */
GXRAM int commands_count = 0, commands_length = 0;
static int commands_count=0, commands_length=0;
/* Array of pointers to queued commands (stored as an offset into YRAM). */
GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
/* Array of pointers to queued commands. Each command has:
* Top 16 bits: fragment number
* Bottom 16 bits: offset into command data buffer
Rendering order is integer order. */
static uint32_t commands_array[AZRP_MAX_COMMANDS];
static GALIGNED(4) uint8_t commands_data[8192];
/* Array of shader programs and uniforms. */
GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
/* Next free index in the shader program array. */
GXRAM static uint16_t shaders_next = 0;
static uint16_t shaders_next = 0;
/* Hooks. */
static azrp_hook_prefrag_t *azrp_hook_prefrag = NULL;
/* Performance counters. */
GXRAM prof_t azrp_perf_cmdgen;
GXRAM prof_t azrp_perf_sort;
GXRAM prof_t azrp_perf_shaders;
GXRAM prof_t azrp_perf_r61524;
GXRAM prof_t azrp_perf_render;
prof_t azrp_perf_cmdgen;
prof_t azrp_perf_sort;
prof_t azrp_perf_shaders;
prof_t azrp_perf_r61524;
prof_t azrp_perf_render;
//---
// High and low-level pipeline functions
@ -110,25 +113,23 @@ void azrp_render_fragments(void)
while(1) {
while(cmd < next_frag_threshold && i < commands_count) {
azrp_commands_total++;
uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
uint8_t *data = commands_data + (cmd & 0xffff);
prof_enter_norec(azrp_perf_shaders);
shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
prof_leave_norec(azrp_perf_shaders);
if(data[0] == AZRP_SHADER_IMAGE) {
struct azrp_shader_image_command *cmd = (void *)data;
cmd->height -= cmd->lines;
cmd->input += cmd->row_stride * cmd->lines;
cmd->lines = min(cmd->height, azrp_frag_height);
cmd->output = 2 * cmd->x;
}
cmd = commands_array[++i];
}
/* TODO: Consider xram_frame() by DMA in parallel? */
if(azrp_hook_prefrag) {
int size = azrp_width * azrp_frag_height * 2;
(*azrp_hook_prefrag)(frag, azrp_frag, size);
}
prof_enter_norec(azrp_perf_r61524);
xram_frame(azrp_frag, 396 * 8);
if(azrp_scale == 1)
azrp_r61524_fragment_x1(azrp_frag, 396 * azrp_frag_height);
else if(azrp_scale == 2)
azrp_r61524_fragment_x2(azrp_frag, azrp_width, azrp_frag_height);
prof_leave_norec(azrp_perf_r61524);
if(++frag >= azrp_frag_count) break;
@ -149,10 +150,12 @@ void azrp_update(void)
// Configuration calls
//---
// TODO: Use larger fragments in upscales x2 and x3
static void update_frag_count(void)
{
if(azrp_scale == 1)
azrp_frag_count = 28 + (azrp_frag_offset > 0);
azrp_frag_count = 14 + (azrp_frag_offset > 0);
else if(azrp_scale == 2)
azrp_frag_count = 7 + (azrp_frag_offset > 0);
else if(azrp_scale == 3)
@ -162,7 +165,7 @@ static void update_frag_count(void)
static void update_size(void)
{
if(azrp_scale == 1)
azrp_width = 396, azrp_height = 198, azrp_frag_height = 8;
azrp_width = 396, azrp_height = 224, azrp_frag_height = 16;
else if(azrp_scale == 2)
azrp_width = 198, azrp_height = 112, azrp_frag_height = 16;
else if(azrp_scale == 3)
@ -194,6 +197,20 @@ static void default_settings(void)
azrp_config_scale(1);
}
//---
// Hooks
//---
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void)
{
return azrp_hook_prefrag;
}
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *hook)
{
azrp_hook_prefrag = hook;
}
//---
// Custom shaders
//---
@ -226,7 +243,7 @@ bool azrp_queue_command(void *command, size_t size, int fragment, int count)
if(commands_length + size >= 8192)
return false;
uint8_t *dst = YRAM + commands_length;
uint8_t *dst = commands_data + commands_length;
uint8_t *src = command;
for(size_t i = 0; i < size; i++)

View File

@ -1,727 +0,0 @@
/* Azur's built-in shaders: <image>
If there ever was a fantastic piece of assembler engineering in my work up
to this point, this would be it. Every trick in the book is used here, from
clever instruction combinations, pipeline flow and tricky DSP abuse all the
way up to memory layout planning, transforms on loop structures, and most
critically superscalar parallelism.
While the performance of the shader is not *strictly* proportional to the
speed of the tightest loop, it's very close. The use of operand-bus XRAM for
graphics data, systematic alignment, and detailed pipeline stalling
measurements for common instruction sequences in gintctl allow very accurate
speed predictions to be made based on the tightness of the code.
The palette formats of bopti have been refined for the purpose of this
shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
and P4 being renamed P4_RGB565A with minimal changes along with a variation
aptly named P4_RGB565.
The asymptotic performance for each format is as follows:
* RGB565: 1 cycle/pixel if source and destination align
2 cycles/pixel otherwise
* RGB565A: 4 cycles/pixel
* P8_RGB565A: 4.5 cycles/pixel
* P8_RGB565: 3 cycles/pixel
* P4_RGB565A: 5 cycles/pixel
* P4_RGB565: 3.5 cycles/pixel
Entirely documenting this code would take me hours, but some elements are
provided in the comments. Superscalar parallelism is most easily appreciated
by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
structural technique at play in this code is loop transforms.
Basically, a loop that loads a pixel, performs computations with it, and
writes the result is inefficient because of the RAW dependencies on most
operations (with full stall cycles between loads and computations, and
between computations and uses as addresses). Well-established loop
optimization literature has lots of techniques to help with this problem,
and I use two here:
* _Pipelining_ the loop consists in handling a single pixel over several
iterations by doing a little bit of work in each iteration. The data for
the pixel would move from register to register at each iteration, with the
loop code doing one stage's worth of computation on each register. (You
can view it as a diagonal iteration pattern in the pixel*instruction grid
if you like such visualizations.)
By increasing the number of pixels in the pipeline, a lot of independent
data can be obtained, reducing dependency pressure and allowing for
greater parallelism at the cost of more registers being used.
The use of pipelining in this shader is very modest, with 2 stages at
most, and usually only a couple of instructions being performed in advance
for the next pixel while the current one finishes processing. Register
assignments have some subtleties though since pressure is high overall.
* _Unrolling_ iterations of the loop consists in loading two (or more)
pixels at the start of each iteration so that we can work on one while
waiting for stalls and dependencies on the other.
Unlike pipelining, a loop iteration starts and ends with full pixels and
no work carries between iterations. Unrolling allows different pixels to
use different registers and generally better optimize the instruction
sequence, at the cost of only supporting pixel counts that are multiples of
the unrolling level.
Handling non-multiple sizes is the everlasting bane of unrolled loops,
sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
to only handle even sizes and neutralize unwanted pixels after the fact.
Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
almost all formats (except RGB565A which performs DSP trickery).
*/
.global _azrp_shader_image
.align 4
/* Register assignment
r0: (temporary)
r1: Lines
r2: Command queue; (temporary)
r3: Input
r4: [parameter] azrp_width*2; output stride
r5: [parameter] Command queue; Output
r6: [parameter] azrp_frag; alpha value; (temporary)
r7: Columns
r8: Image pointer; (temporary)
r9: Input stride */
_azrp_shader_image:
mov.l r8, @-r15
add #2, r5
mov.l r9, @-r15
mov r5, r2
mov.w @r2+, r7 /* command.columns */
mov.l @r2+, r8 /* command.image */
mov.w @r2+, r5 /* command.output (offset) */
sub r7, r4
mov.w @r8+, r9 /* image.profile */
sub r7, r4
mov.w @r2+, r1 /* command.lines */
add r6, r5
mov.l @r2+, r3 /* command.input (pointer) */
shll2 r9
mova .formats, r0
mov.w @r8+, r6 /* image.alpha */
mov.l @(r0,r9), r0
mov.w @r8+, r9 /* image.width */
jmp @r0
nop
.align 4
.formats:
.long _RGB565
.long _RGB565A
.long _NOP /* P8 */
.long _P4_RGB565A /* =P4 */
.long _P8_RGB565
.long _P8_RGB565A
.long _P4_RGB565
/* [Loop macros]
The following macros implement the main loop of the image renderer.
* Each line is rendered in the tight loop between 2: and 3: (both included).
* r5 is the output (with stride r4, in bytes)
* r3 is the input (with stride r9, in bytes)
* There are r1 rows with r7 iterations each */
#define START() \
nop; /* 4-alignment */ \
ldrs 2f; \
ldre 3f; \
1: ldrc r7
#define END_NORET() \
dt r1; \
add r4, r5; \
bf.s 1b; \
add r9, r3
#define END() \
END_NORET(); \
mov.l @r15+, r9; \
rts; \
mov.l @r15+, r8
/* [Rendering strategy for the RGB565 format]
In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
optimize by moving longwords. Since longwords are pairs of pixels, there are
variations and subcases based on the parity of each parameter:
* w[eo] denotes whether the width of the image is even or odd;
* d[eo] denotes whether the memory accesses to the source and destination
are even (4-aligned) or odd (2-aligned).
When the destination and source have identical parity, the d[eo] variation
can be defined. In this case the copy is pretty direct, it's a longword copy
and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
start or end address is 2-aligned.
However, when they have opposite parity, each longword read matches up with
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
not help because of the stall cycle between loading a register and using it
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
the word-based copy). Unrolling iterations could help but would be too
complex here (adding sub-cases); a super-heavy renderer with more hypotheses
(like a tileset shader) should aim for that route though. Also, movua.l
followed by mov.l is even slower (5 cycles). */
/* RGB565 renderer: a 2D memcpy specialized by width parity (w[eo]) and
   source/destination access parity (d[eo]); see strategy comment above.
   Registers per the assignment table at the top of the file: r3 input,
   r5 output, r7 columns, r9 input stride (converted to bytes here). */
.align 4
_RGB565:
mov #8, r0 /* Maximum width for naive method */
sub r7, r9
cmp/ge r7, r0
shll r9
bt.s _RGB565.naive
mov #2, r0
/* Use naive method for opposite source/destination parity */
mov r5, r6
xor r3, r6
tst r0, r6
bf _RGB565.naive
/* T after shlr = width parity; dispatch on even/odd width */
shlr r7
bt _RGB565.wo
_RGB565.we:
tst r0, r5
bf _RGB565.we_do
/* This is 4-aligned */
_RGB565.we_de:
START()
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END()
.align 4
/* Even width, 2-aligned accesses: word copy at each edge, longwords inside */
_RGB565.we_do:
add #-1, r7
START()
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END()
.align 4
_RGB565.wo:
tst r0, r5
bf _RGB565.wo_do
/* Odd width, 4-aligned: longwords plus one trailing word per line */
_RGB565.wo_de:
START()
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END()
.align 4
/* Odd width, 2-aligned: one leading word, then longwords */
_RGB565.wo_do:
START()
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END()
/* Naive method for small widths and opposite source/destination parity */
.align 4
_RGB565.naive:
START()
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END()
/* [Rendering strategy for the RGB565A format]
Since we have to check for the alpha value in each pixel, there's really no
longword-based optimization. Instead, we just go as fast as possible with
each pixel, using DSP instructions because conditional execution is pretty
damn good. This takes 4 cycles/pixel. I tried a number of reductions to
3 cycles/pixel but could not get any of them to work. */
/* RGB565A renderer: per-pixel alpha test using DSP conditional execution;
   4 cycles/pixel (see strategy comment above).
   r6 holds the alpha (transparent) color value on entry. */
.align 4
_RGB565A:
/* Move alpha to the high word to match movs.w loading into the top of x0 */
shll16 r6
mov #0x0004, r0 /* DC Zero mode */
sub r7, r9
shll r9
lds r6, y0
lds r0, dsr
START()
/* Load source pixel; compare against alpha while fetching the current
   destination pixel; if equal (transparent), keep the destination */
2: movs.w @r3+, x0
pcmp x0, y0 movx.w @r5, x1
dct pcopy x1, x0
3: movx.w x0, @r5+
END()
/* [Rendering strategy for the P8_RGB565A format]
The work needed for each pixel gets more difficult as we go, with alpha
being the major culprit due to its additional comparisons, jumps, and
limited optimization opportunities when unrolling due to conditionally-
executed code.
Because arithmetic is unavoidable and there are 1-cycle delays between both
loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
iterations with a 2-stage pipeline structure. This fills the stall cycles
and increases parallelism significantly. Pure loop optimization handbook.
Dealing with odd widths is a major pain as usual. Instead of adding logic to
handle the extra pixel separately, this routine lets the loop overwrite it,
then restores its original value afterwards - a delightfully elegant trick.
The P8 format is actually so bad that spending precious time grinding cycles
felt completely inappropriate without first refining it. This led to two new
variations, P8_RGB565 and P8_RGB565A, which fix the following problems.
-> First there is alpha for all images, which is the most costly feature,
single-handedly accounting for half of the work per pixel. P8_RGB565
does not support alpha, which basically doubles performance.
-> Then, there is the alpha value itself. In P8 it is a variable (and fxconv
sets it to 0xff), which burns a register for the comparison and enforces
a fixed order between comparison and left-shift. P8_RGB565A always sets
an alpha value of 0x00 which lifts both constraints.
-> Then, there are palette indices. In P8 they are unsigned, which requires
an extu.b. In P8_RGB565 and P8_RGB565A they are signed, so the sign-
extended value of the mov.b can be used directly (once doubled). The
palette base is simply offset by 128 entries, with colors numbered
-128..-1 first and only then 0..127.
-> Finally, there's the palette itself. In P8 it always has 256 entries,
even when only a few are used. For small images this is a huge waste, so
P8_RGB565 and P8_RGB565A only store colors that are actually used.
P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good
compared to 4 cycles/pixel for RGB565A. */
.align 4
_P8_RGB565A:
mov.l r13, @-r15
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
mov.l r10, @-r15
movt r6
mov.w _P8_RGB565A.palette_distance, r0
shll r13
add r6, r7
sub r6, r9
sub r6, r4
sub r6, r4
add r0, r8
add r5, r13
mov r7, r2
add #-4, r5 /* Output offset compensation in the loop */
shll2 r2
add r4, r2
START()
mov.b @r3+, r6
/* Save next pixel for the odd-width case */
mov.w @r13, r12
mov.b @r3+, r10
tst r6, r6
/* 2-unrolled 2-stage main loop */
2: add r6, r6
mov r6, r0
add r10, r10
bt.s 5f
tst r10, r10
mov.w @(r0,r8), r0
mov.w r0, @(4,r5)
5: mov.b @r3+, r6
mov r10, r0
bt.s 6f
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @(2,r5)
6: mov.b @r3+, r10
3: tst r6, r6
/* Restore last pixel */
mov.w r12, @r13
add r2, r13
END_NORET()
mov.l @r15+, r10
mov.l @r15+, r12
mov.l @r15+, r13
mov.l @r15+, r9
rts
mov.l @r15+, r8
_P8_RGB565A.palette_distance:
/* Distance between image pointer and palette array base */
.word 260
/* [Rendering strategy for the P8_RGB565 format]
See P8_RGB565A for format details. Removing the checks for transparency and
the jumps simplifies the instruction sequence and allows superior
parallelism because all paths are unconditional. This routines achieves
3 cycles/pixel asymptotically. */
.align 4
_P8_RGB565:
mov.l r13, @-r15
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
mov.l r10, @-r15
movt r6
mov.w _P8_RGB565.palette_distance, r0
shll r13
add r6, r7
sub r6, r9
sub r6, r4
sub r6, r4
add r0, r8
add r5, r13
add #-4, r5 /* Output offset compensation in the loop */
mov r7, r2
shll2 r2
add r4, r2
START()
mov.b @r3+, r0
/* Save next pixel for the odd-width case */
mov.w @r13, r12
mov.b @r3+, r10
shll r0
/* 2-unrolled 2-stage main loop */
2: mov.b @r3+, r6
shll r10
mov.w @(r0,r8), r0
/* This nop is not for show, it actually prevents the loop from slowing
down to 7 cycles /i, probably due to instruction reads alignment. */
nop
mov.w r0, @(4,r5)
mov r10, r0
mov.b @r3+, r10
add #4, r5
mov.w @(r0,r8), r0
shll r6
mov.w r0, @(2,r5)
3: mov r6, r0
/* Restore last pixel */
mov.w r12, @r13
add r2, r13
END_NORET()
mov.l @r15+, r10
mov.l @r15+, r12
mov.l @r15+, r13
mov.l @r15+, r9
rts
mov.l @r15+, r8
_P8_RGB565.palette_distance:
/* Distance between image pointer and palette array base */
.word 260
/* [Rendering strategy for the P4_RGB565A format]
This is the most complex format. Most of the remarks that apply to
P8_RGB565A also apply here, except that there are less opportunities to save
computation because nibbles must be extracted anyway.
The P4_RGB565A format is simply bopti's P4, but an additional variation
P4_RGB565 is specified to save on transparency handling, which is very
expensive.
The special nature of the nibble packing means the simplest loop form writes
2 pixels from a 2-aligned source image position in a single iteration. Other
structures don't even come close: selecting nibbles individually is folly,
while not unrolling is inefficient. So the whole point of this routine is to
forcibly align the subimage on a byte-aligned and never break that grid.
The command builder for P4 does this alignment before submitting the
command. Obviously the transform can cause one extra pixel to be overridden
on each side of every line. The command is thus extended with two edge
offsets indicating pixels to preserve at each end. When overwrites occurs,
the edge offsets point to the overwritten pixels so they can be restored.
Otherwise, they point to the next pixels and the restores are no-ops. See
the strategy used for managing unrolling in P8 formats for details.
The only irregularity is image width, which the command builder cannot
modify. It is rounded up to the next multiple of 2, then halved. There is a
nice trick for this operation, which is [shlr rX] then adding T to rX. We
also need to add -1 for another adjustment, and both are combined into an
addc, which saves one add and one movt off the EX critical chain.
The main loop achieves 5 cycles/pixel. */
.align 4
_P4_RGB565A:
shlr r7
mov.w @(6, r2), r0 /* command.edge2 */
mov.l r12, @-r15
add #-15, r2 /* Go back to start of command */
mov #-1, r12
shlr r9
mov.l r11, @-r15
addc r12, r9
mov r0, r12
add r12, r12
mov.l r10, @-r15
sub r7, r9
mov.b @r2, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.l r13, @-r15
mov r5, r0
mov.l r14, @-r15
shll r11
add #-4, r5
nop /* 4-alignment */
START()
mov.b @r3+, r6
mov r0, r10
mov.w @(r0,r11), r13
mov.w @(r0,r12), r14
shll r6
/* Main loop with 2 pixels sharing a single byte */
2: mov r6, r0
and #0x1e, r0
tst r0, r0
bt.s 4f
shlr2 r6
mov.w @(r0,r8), r0
mov.w r0, @(6,r5)
4: shlr2 r6
mov r6, r0
and #0x1e, r0
tst r0, r0
mov.b @r3+, r6
bt.s 5f
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @r5
3: 5: shll r6
mov r10, r0
mov r7, r10
shll2 r10
mov.w r13, @(r0,r11)
add r4, r10
mov.w r14, @(r0,r12)
add r0, r10
mov r10, r0
/* Parallelizes with [dt r1] expanded from END_NORET() */
END_NORET()
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r10
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r9
rts
mov.l @r15+, r8
/* [Rendering strategy for the P4_RGB565 format]
Same as P4_RGB565A without transparency checks (fairly straightforward). The
core loop runs in 3.5 cycles/pixel. */
.align 4
_P4_RGB565:
shlr r7
mov.w @(6, r2), r0 /* command.edge2 */
mov.l r10, @-r15
add #-15, r2 /* Go back to start of command */
mov.l r12, @-r15
shlr r9
add #2, r8 /* image.palette */
mov #-1, r12
mov.l r11, @-r15
addc r12, r9
mov r0, r12
add r12, r12
mov.b @r2, r11 /* command.edge1 */
sub r7, r9
mov.l r13, @-r15
mov #0x1e, r2
mov.l r14, @-r15
shll r11
mov r5, r0
add #-4, r5
START()
mov.b @r3+, r6
mov #-4, r10
mov.l r0, @-r15
mov.w @(r0,r11), r13
mov.w @(r0,r12), r14
shll r6
/* Main loop with 2 pixels sharing a single byte */
2: mov r6, r0
and #0x1e, r0
shld r10, r6
mov.w @(r0,r8), r0
and r2, r6
mov.w r0, @(6,r5)
mov r6, r0
mov.b @r3+, r6
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @r5
3: shll r6
mov.l @r15+, r0
mov r7, r10
shll2 r10
mov.w r13, @(r0,r11)
add r4, r10
mov.w r14, @(r0,r12)
add r0, r10
mov r10, r0
/* Parallelizes with [dt r1] expanded from END_NORET() */
END_NORET()
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8
/* [Unsupported formats]
P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
_NOP:
/* Epilogue only: restore r9/r8 saved by the shader entry point, render
   nothing for this command */
mov.l @r15+, r9
rts
mov.l @r15+, r8

View File

@ -1,88 +1,45 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE = -1;
__attribute__((constructor))
static void register_shader(void)
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
struct gint_image_cmd *cmd)
{
extern azrp_shader_t azrp_shader_image;
AZRP_SHADER_IMAGE = azrp_register_shader(azrp_shader_image);
}
void azrp_shader_image_configure(void)
{
azrp_set_uniforms(AZRP_SHADER_IMAGE, (void *)(2 * azrp_width));
}
//---
/* Profile IDs */
#define RGB565 0
#define RGB565A 1
#define P4_RGB565A 3
#define P8_RGB565 4
#define P8_RGB565A 5
#define P4_RGB565 6
void azrp_image(int x, int y, bopti_image_t const *image)
{
azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0);
}
void azrp_subimage(int x, int y, bopti_image_t const *image,
int left, int top, int width, int height, int flags)
{
prof_enter(azrp_perf_cmdgen);
if(!(flags & DIMAGE_NOCLIP)) {
/* TODO: image: clip function */
}
struct azrp_shader_image_command cmd;
cmd.shader_id = AZRP_SHADER_IMAGE;
cmd.columns = width;
cmd.image = image;
int row_stride;
if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
row_stride = image->width;
cmd.input = (void *)image->data + (image->data[0] * 2) + 2 +
top * row_stride + left;
}
else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
row_stride = (image->width + 1) >> 1;
cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1);
int odd_left = left & 1;
int odd_right = (left + width) & 1;
cmd.edge1 = -1 + odd_left;
cmd.edge2 = width + odd_left;
cmd.columns += odd_left + odd_right;
x -= odd_left;
}
else {
row_stride = image->width << 1;
cmd.input = (void *)image->data + top * row_stride + (left << 1);
}
/* TODO: Ironically, this loads all 3 entry points */
int p = img->profile;
if(p == IMAGE_RGB565 || p == IMAGE_RGB565A)
cmd->shader_id = AZRP_SHADER_IMAGE_RGB16;
else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A)
cmd->shader_id = AZRP_SHADER_IMAGE_P8;
else
cmd->shader_id = AZRP_SHADER_IMAGE_P4;
/* This divides by azrp_frag_height */
int fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
/* TODO: Have a proper way to do optimized-division by azrp_frag_height */
int fragment_id = (azrp_scale == 1) ? (box->y >> 4) : (box->y >> 4);
/* These settings only apply to the first fragment */
int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
cmd.lines = azrp_frag_height - first_y;
cmd.output = 2 * (azrp_width * first_y + x);
int first_y = (box->y + azrp_frag_offset) & (azrp_frag_height - 1);
cmd->lines = min(box->h, azrp_frag_height - first_y);
cmd->output = (void *)azrp_frag + (azrp_width * first_y + cmd->x) * 2;
/* Settings for further updates */
cmd.height = height;
cmd.row_stride = row_stride;
cmd.x = x;
int n = 1 + (height - cmd.lines + azrp_frag_height - 1) / azrp_frag_height;
azrp_queue_command(&cmd, sizeof cmd, fragment_id, n);
prof_leave(azrp_perf_cmdgen);
int n = 1 + (box->h - cmd->lines + azrp_frag_height-1) / azrp_frag_height;
azrp_queue_command(cmd, sizeof *cmd, fragment_id, n);
}
/* Render a sub-rectangle of an image. Dispatches on the image's pixel format
   to the matching format-specific renderer; unsupported profiles are
   silently ignored. */
void azrp_subimage(int x, int y, image_t const *img,
    int left, int top, int width, int height, int flags)
{
    switch(img->profile) {
    case IMAGE_RGB565:
    case IMAGE_RGB565A:
        azrp_subimage_rgb16(x, y, img, left, top, width, height, flags);
        break;
    case IMAGE_P8_RGB565:
    case IMAGE_P8_RGB565A:
        azrp_subimage_p8(x, y, img, left, top, width, height, flags);
        break;
    case IMAGE_P4_RGB565:
    case IMAGE_P4_RGB565A:
        azrp_subimage_p4(x, y, img, left, top, width, height, flags);
        break;
    }
}

/* Render a full image: the sub-image covering everything, with no effects. */
void azrp_image(int x, int y, image_t const *img)
{
    int w = img->width, h = img->height;
    azrp_subimage(x, y, img, 0, 0, w, h, 0);
}

View File

@ -0,0 +1,37 @@
/* mov.wv: Move at a variable offset. This macro is functionally identical to
mov.w \SRC, @(\OFF, \DST)
except that when OFF=0 it simplifies into [mov.w \SRC, @\DST] so that SRC is
not constrained to be r0. */
.macro mov.wv SRC, OFF, DST
.if (\OFF == 0)
mov.w \SRC, @\DST
.else
/* Displacement form: the ISA constrains \SRC to be r0 here */
mov.w \SRC, @(\OFF, \DST)
.endif
.endm
/* START: Sets up the inner and outer loop. The outer loop is anything between
the calls to macros START and END, while the inner loop is the code between
labels 2: and 3: (both *INCLUDED*).
NOTE(review): ldrs/ldre/ldrc are SH DSP hardware repeat-loop instructions:
they set the repeat start/end addresses (labels 2:/3:) and load the repeat
counter from r2 — assumes DSP mode is enabled by the renderer; confirm. */
.macro START
ldrs 2f
ldre 3f
1: ldrc r2
nop
.endm
/* END: Finishes the outer loop and adds strides.
r1 is the outer (row) counter; r4 and r6 are the input and output strides,
added to the input pointer r3 and output pointer r5 after each row. */
.macro END
dt r1
add r4, r3
bf.s 1b
add r6, r5
.endm
/* EPILOGUE: Finishes the call by reloading registers saved in the prologue.
Returns the advanced input pointer (r3) in r0; the C-side fragment shaders
store it back into cmd->input to resume on the next fragment. */
.macro EPILOGUE
mov.l @r15+, r9
mov r3, r0
rts
mov.l @r15+, r8
.endm

View File

@ -0,0 +1,70 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_P4 = -1;

/* Fragment shader: render the queued P4 command into the current fragment,
   then update the command in-place so rendering resumes on the next one. */
static void shader_p4(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_p4_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_P4 = azrp_register_shader(shader_p4);
}

void azrp_shader_image_p4_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_P4, (void *)azrp_width);
}

void azrp_image_p4(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_p4(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent P4 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_P4_RGB565A) {
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_p4_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_p4_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p4_clearbg_alt;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_p4_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_p4_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent index to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p4_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized P4 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_p4_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_p4_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p4_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_p4_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p4(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,119 @@
.global _azrp_image_shader_p4_normal
#include "image_macros.S"
/* P4 Opaque rendering, Azur version: trivial with loop transforms.
This is a pretty direct loop with no difficult tricks involved; it expands
on P8 by adding another edge pointer. The main change is the decoding logic
which now only involves a single byte to load for every two pixels, but more
arithmetic to extract the nibbles.
All the loops in Azur's P4 functions are obvious EX chains and thus any
optimization would need to simplify the arithmetic to gain any half-cycles.
r0: [temporary]
r7: Right edge pointer
r8: Right edge value
r9: Palette
r10: Left edge pointer
r11: Left edge value
r12: Edge stride
r13: [temporary]
r14: [temporary] */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2
nop
add r10, r10
nop
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0
mov.l r12, @-r15
shll r7
mov.l r11, @-r15
add r5, r7
mov r0, r12
add r6, r12
mov.l r13, @-r15
add r5, r10
mov.l r14, @-r15
add #-4, r5
add #-1, r4 /* Input stride compensation for pipelining */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, \TMP1
mov #-4, \TMP2
mov.w @r7, r8 /* Save right edge */
nop
mov.w @r10, r11 /* Save left edge */
shll \TMP1
/* Inner loop: one source byte decoded into two palette lookups; the nibble
extraction (and #0x1e) directly produces a word offset into the palette */
2: mov \TMP1, r0
and #0x1e, r0
shld \TMP2, \TMP1
mov #0x1e, \TMP2
mov.w @(r0,r9), r0
and \TMP2, \TMP1
mov.w r0, @(\OFF1,r5)
mov \TMP1, r0
mov.b @r3+, \TMP1
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
mov #-4, \TMP2
mov.w r0, @(\OFF2,r5)
3: shll \TMP1
mov.w r8, @r7 /* Restore right edge */
add r12, r7
mov.w r11, @r10 /* Restore left edge */
add r12, r10
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants
of the loop (see the macro parameters above) */
_azrp_image_shader_p4_normal:
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 4, r13, r14, 6, 0
9: GEN_NORMAL_LOOP 1, -4, r13, r14, 0, 6

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with palette index old_color replaced by
   new_color. */
void azrp_image_p4_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_p4_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_index;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = gint_image_p4_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent index with the
   background color is exactly a palette-index swap. */
void azrp_image_p4_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_p4_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = gint_image_p4_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,71 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_P8 = -1;

/* Fragment shader: render the queued P8 command into the current fragment,
   then update the command in-place so rendering resumes on the next one. */
static void shader_p8(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_p8_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_P8 = azrp_register_shader(shader_p8);
}

void azrp_shader_image_p8_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_P8, (void *)azrp_width);
}

void azrp_image_p8(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_p8(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent P8 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_P8_RGB565A) {
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_p8_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_p8_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p8_clearbg;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_p8_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_p8_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent index to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p8_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized P8 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_p8_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_p8_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p8_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_p8_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p8(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,100 @@
.global _azrp_image_shader_p8_normal
#include "image_macros.S"
/* P8 Opaque rendering, Azur version: trivial with loop transforms.
This is fairly straightforward, with no particular tricks; just index the
palette as fast as possible in a 2-unrolled 2-stage-pipeline loop that maxes
out CPU speed.
r0: [temporary]
r7: Right edge pointer
r8: Right edge value
r9: Palette
r10: [temporary]
r11: [temporary]
r12: Right edge stride */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
shll2 r0
mov.l r10, @-r15
shll r7
mov.l r11, @-r15
add r5, r7
mov r0, r12
add r6, r12
add #-4, r5
nop
add #-2, r4 /* Input stride compensation for pipelining */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, r0
nop
mov.w @r7, r8 /* Save right edge */
nop
mov.b @r3+, \TMP1
shll r0
/* Inner loop: two palette lookups per iteration; indices are doubled
(shll) to address the 16-bit palette entries */
2: mov.b @r3+, \TMP2
shll \TMP1
mov.w @(r0,r9), r0
/* Fun fact: omitting this nop slows the loop to 7 cycles/i */
nop
mov.w r0, @(\OFF1,r5)
mov \TMP1, r0
mov.b @r3+, \TMP1
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
shll \TMP2
mov.w r0, @(\OFF2,r5)
3: mov \TMP2, r0
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_p8_normal:
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 4, r10, r11, 4, 2
9: GEN_NORMAL_LOOP 1, -4, r10, r11, 2, 4

View File

@ -0,0 +1,142 @@
.global _azrp_image_shader_p8_swapcolor
#include "image_macros.S"
/* P8 SWAPCOLOR, Azur version: by branchless xor selection.
This is essentially the same logic as gint's P8 SWAPCOLOR version, but with
a 2-unrolled 2-stage-pipeline since the bottleneck on RAM is now on the CPU.
r0: [temporary]
r7: Right edge pointer
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Holds (x ^ y) & -(c == x) during selection
r11: cmd.color_1
r12: Right edge stride
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-4,r15): Right edge value */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
shll2 r0
mov.l r11, @-r15
shll r7
mov.w @r8+, r11 /* cmd.color_1 */
add r5, r7
mov.l r10, @-r15
add #-4, r5
mov.l r13, @-r15
exts.b r11, r11
mov r11, r13
add r13, r13
mov.w @r8, r8 /* cmd.color_2 */
add r9, r13
mov r0, r12
add r6, r12
mov.w @r13, r13 /* r13 = palette[cmd.color_1] */
add #-2, r4 /* Input stride compensation for pipelining */
mov.l r14, @-r15
nop
xor r13, r8 /* r8 = palette[cmd.color_1] ^ cmd.color_2 */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, \TMP2
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
cmp/eq \TMP2, r11
mov.b @r3+, \TMP1
add \TMP2, \TMP2
/* Inner loop: subc turns the T bit of the index comparison into a 0/-1
mask, so the xor swap only applies to pixels equal to cmd.color_1 */
2: subc r10, r10
mov \TMP2, r0
cmp/eq \TMP1, r11
mov.w @(r0, r9), r0
and r8, r10
nop
xor r10, r0
nop
mov.w r0, @(\OFF1, r5)
add #\OUT_DIR, r5
mov.b @r3+, \TMP2
subc r10, r10
add \TMP1, \TMP1
mov \TMP1, r0
mov.w @(r0, r9), r0
cmp/eq \TMP2, r11
mov.b @r3+, \TMP1
and r8, r10
xor r10, r0
nop
mov.w r0, @(\OFF2, r5)
3: add \TMP2, \TMP2
/* TODO: Use x0 as temporary storage by moving the main registers */
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r10
mov.l @r15+, r11
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_p8_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 4, 2
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 2, 4

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with palette index old_color replaced by
   new_color. */
void azrp_image_p8_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_p8_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_index;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_p8_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent index with the
   background color is exactly a palette-index swap. */
void azrp_image_p8_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_p8_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_p8_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,71 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_RGB16 = -1;

/* Fragment shader: render the queued RGB16 command into the current
   fragment, then update it in-place so rendering resumes on the next one. */
static void shader_rgb16(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_rgb16_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_RGB16 = azrp_register_shader(shader_rgb16);
}

void azrp_shader_image_rgb16_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_RGB16, (void *)azrp_width);
}

void azrp_image_rgb16(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_rgb16(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent RGB16 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_RGB565A) {
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, false, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_rgb16_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_rgb16_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_rgb16_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = azrp_image_shader_rgb16_clearbg;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,135 @@
.global _azrp_image_shader_rgb16_clearbg
#include "image_macros.S"
/* RGB16 CLEARBG and DYE, Azur version: by NULL canceling.
This function handles both CLEARBG and DYE, which happen to work identically
on RGB16, save for the fact that the DYE loop ignores the value of opaque
pixels and uses the dye color instead. It's one of the standard 2-unrolled
2-stage-pipeline loops with a right edge, using NULL canceling for
transparency.
r0: [temporary] (CLEARBG) or dye value (DYE)
r7: Right edge pointer
r8: Right edge value
r9: Background color
r10: Nullable output pointer
r11: 0 (to neutralize addc during NULL-cancelling)
r12: Right edge stride
r13: [temporary] (one of the pixels)
r14: [temporary] (one of the pixels in DYE)
The GEN_CLEARBG_LOOP macro parameters are as follows. All of them except for
SRC1 and SRC2 are determined by HFLIP; it's just simpler to set their values
on the macro's call site than have .if statements everywhere. This set of
parameters is used for virtually all the functions of all the formats.
SRC1 and SRC2 are used in DYE mode to replace the pixel values read from
memory with a constant register.
HFLIP: Whether to enable HFLIP
OUT_DIR: Variation of r5 at each loop, either 4 or -4
TMP1: Temporary register for first pixel
TMP2: Temporary register for second pixel
OFF1: Offset for first pixel write
OFF2: Offset for second pixel write
SRC1: Source of first write (here either TMP1 or r0)
SRC2: Source of second write (here either TMP2 or r0) */
.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2, SRC1, SRC2
mov.w @r8+, r7 /* cmd.edge_2 */
shlr r2
mov.l r11, @-r15
mov #0, r11
mov.w @r8+, r9 /* cmd.color_1 */
shll r7
mov.l r10, @-r15
add r5, r7
mov.l r12, @-r15
add #-2, r5 /* Pre-decrement, see output logic */
mov r2, r12
shll2 r12
mov.l r13, @-r15
add r6, r12
mov.l r14, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
.if \HFLIP
mov r2, r0
shll2 r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
mov.w @r8+, r0 /* cmd.color_2 */
nop
START
mov.w @r3+, \TMP1
nop
mov.w @r7, r8 /* Save right edge */
nop
cmp/eq \TMP1, r9
nop
/* NULL canceling: addc folds the T bit of the transparency compare, so
r10 = (pixel == background) ? 0 : r5; transparent pixels are thus written
to address 0 — presumably harmless on this target; verify memory map */
2: mov #-1, r10
addc r11, r10
mov.w @r3+, \TMP2
and r5, r10
add #\OUT_DIR, r5
nop
mov.wv \SRC1, \OFF1, r10
cmp/eq \TMP2, r9
mov #-1, r10
addc r11, r10
mov.w @r3+, \TMP1
and r5, r10
cmp/eq \TMP1, r9
3: mov.wv \SRC2, \OFF2, r10
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r11
EPILOGUE
.endm
/* The CLEARBG entry point is compiled out when this file is re-included by
image_rgb16_dye.S, which only needs the macro above */
#ifndef AZRP_RGB16_DYE
_azrp_image_shader_rgb16_clearbg:
tst #1, r0
bf 9f
GEN_CLEARBG_DYE_LOOP 0, 4, r0, r13, 2, 0, r0, r13
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r0, 0, 2, r13, r0
#endif

View File

@ -0,0 +1,12 @@
.global _azrp_image_shader_rgb16_dye
/* Defining AZRP_RGB16_DYE makes the included file provide only the shared
GEN_CLEARBG_DYE_LOOP macro, not its own entry point */
#define AZRP_RGB16_DYE
#include "image_rgb16_clearbg.S"
/* See image_rgb16_clearbg.S for details on this function. */
_azrp_image_shader_rgb16_dye:
tst #1, r0
bf 9f
/* SRC1 = SRC2 = r0: the dye color held in r0 replaces the value of every
opaque pixel (cf. register map in image_rgb16_clearbg.S) */
GEN_CLEARBG_DYE_LOOP 0, 4, r14, r13, 2, 0, r0, r0
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r14, 0, 2, r0, r0

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_rgb16_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_rgb16_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent value to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 12;
        cmd.loop = azrp_image_shader_rgb16_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized RGB16 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_rgb16_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_rgb16_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_rgb16_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_rgb16(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,124 @@
.global _azrp_image_shader_rgb16_normal
#include "image_macros.S"
/* RGB16 Opaque rendering, Azur version: by straightforward copy.
This function of the image renderer is designed for Azur's streaming model
only. Unlike its RAM-model counterpart which is bottlenecked by its writing
speed, this function is entirely limited by the CPU's ability to output the
data in the required format.
In the simple case where there is no color effect and no HFLIP, the task of
rendering a 16-bit opaque image boils down to a 2-dimensional memcpy. This
task can be optimized by moving longwords if the source and destination and
co-4-aligned, with four variations depending on the width and initial
position, identified by the following parameters:
* w1 / w2 denotes the parity of the command width;
* o2 / o4 denotes the alignment of the output.
It is easy to see that when input and output are not co-aligned, any attempt
to combine two word reads into a single long write requires at least 3
cycles per 2 pixels and needs parallelism over several pixels to not get
immediately shut down by the LS-to-EX delay. Here we decide to naively copy
by words, which achieves 4 cycles per 2 pixels, mainly because large RGB16
images are very quickly bottlenecked in reading by their own size anyway.
The HFLIP version also needs to rearrange pixels, and is thus performed with
word-based copies in all situations, which is a straightforward process.
NOTE(review): register conventions inferred from the shared macros — r2 is
the row width (in pixels), r3/r5 the input/output pointers, r4/r6 the
strides; confirm against the command layout. */
_azrp_image_shader_rgb16_normal:
/* Not a single cycle */
tst #1, r0
bf _BACKWARD_WORD_COPY
mov #8, r0 /* Use the naive method for width 8 */
cmp/ge r2, r0
bt.s _FORWARD_WORD_COPY
nop
mov r5, r0 /* Check if r3 and r5 are co-aligned */
xor r3, r0
/* Not a single cycle */
tst #2, r0
bt _FORWARD_LONG_COPY
_FORWARD_WORD_COPY:
START
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END
EPILOGUE
_FORWARD_LONG_COPY:
shlr r2 /* Test width parity */
mov #2, r0
bt .w1
nop
.w2: tst r0, r3 /* Test alignment of input */
bf .w2d2
.w2d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE
.w2d2: add #-1, r2
nop
START
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END
EPILOGUE
.w1: tst r0, r3 /* Test alignment of input */
bf .w1d2
.w1d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END
EPILOGUE
.w1d2: START
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE
/* HFLIP variant: write words backwards from the end of the output row */
_BACKWARD_WORD_COPY:
mov r2, r0
shll r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
START
2: movs.w @r3+, x0
3: movs.w x0, @-r5
END
EPILOGUE

View File

@ -0,0 +1,116 @@
.global _azrp_image_shader_rgb16_swapcolor
#include "image_macros.S"
/* RGB16 SWAPCOLOR, Azur version: by branchless xor selection.
The xor selection is explained in gint's version of P8 SWAPCOLOR. This
version's selection is slightly simpler because we don't have to index the
palette to find the source color. We use a 2-unrolled 2-stage-pipeline loop
to optimize for CPU speed.
r7: Right edge pointer
r8: Right edge value
r9: cmd.color_1
r10: Holds (x ^ y) & -(c == x) during selection
r11: cmd.color_1 ^ cmd.color_2 (ie. x ^ y)
r12: Right edge stride
r13: [temporary] */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.w @r8+, r7 /* cmd.edge_2 */
shlr r2
mov.l r11, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
mov.w @r8+, r9 /* cmd.color_1 */
shll r7
mov.l r10, @-r15
add r5, r7
mov.l r12, @-r15
add #-2, r5 /* Predecrement, see output logic */
mov.w @r8+, r11 /* cmd.color_2 */
mov r2, r12
mov.l r13, @-r15
shll2 r12
add r6, r12
nop
xor r9, r11 /* r11 = cmd.color_1 ^ cmd.color_2 */
nop
.if \HFLIP
mov r2, r0
shll2 r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.w @r3+, \TMP1
nop
mov.w @r7, r8 /* Save right edge */
nop
cmp/eq \TMP1, r9
nop
/* Inner loop: subc folds the T bit of the compare into a 0/-1 mask so the
xor swap only applies to pixels equal to cmd.color_1 */
2: subc r10, r10
nop
and r11, r10
mov.w @r3+, \TMP2
xor r10, \TMP1
nop
mov.wv \TMP1 \OFF1 r5
cmp/eq \TMP2, r9
add #\OUT_DIR, r5
nop
subc r10, r10
nop
and r11, r10
mov.w @r3+, \TMP1
xor r10, \TMP2
nop
cmp/eq \TMP1, r9
3: mov.wv \TMP2 \OFF2 r5
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r11
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_rgb16_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 4, r0, r13, 2, 0
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r0, 0, 2

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with old_color replaced by new_color. */
void azrp_image_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_rgb16_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_color, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_color;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_rgb16_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent value with the
   background color is exactly a color swap. */
void azrp_image_rgb16_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_rgb16_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_rgb16_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}