diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index f766ed7..67d3c87 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -44,8 +44,7 @@ _azrp_shader_tex2d:
 	mov.w	@r8+, r9    /* image.width */
 
 	jmp	@r0
-	/* Stall for r9 */
-	sub	r7, r9
+	nop
 
 .align 4
 .formats:
@@ -108,6 +107,8 @@ _azrp_shader_tex2d:
 .align 4
 _RGB565:
 	mov	#8, r0      /* Maximum width for naive method */
+	sub	r7, r9
+
 	cmp/ge	r7, r0
 
 	shll	r9
@@ -118,6 +119,7 @@ _RGB565:
 	/* Use naive method for opposite source/destination parity */
 	mov	r5, r6
 	xor	r3, r6
+
 	tst	r0, r6
 	bf	_RGB565.naive
 
@@ -189,6 +191,8 @@ _RGB565A:
 	shll16	r6
 	mov	#0x0004, r0 /* DC Zero mode */
 
+	sub	r7, r9
+
 	shll	r9
 
 	lds	r6, y0
@@ -245,18 +249,20 @@ _RGB565A:
 .align 4
 _P8_RGB565A:
 	mov.l	r13, @-r15
-	add	#-2, r9 /* Input stride compensation for openness */
+	sub	r7, r9
 
 	mov	r7, r13
-	shlr	r7
+	add	#-2, r9 /* Input stride compensation for openness */
 
 	mov.l	r12, @-r15
-	movt	r6
+	shlr	r7
 
 	mov.l	r10, @-r15
-	shll	r13
+	movt	r6
 
 	mov.w	_P8_RGB565A.palette_distance, r0
+	shll	r13
+
 	add	r6, r7
 
 	sub	r6, r9
@@ -337,18 +343,20 @@ _P8_RGB565A.palette_distance:
 .align 4
 _P8_RGB565:
 	mov.l	r13, @-r15
-	add	#-2, r9 /* Input stride compensation for openness */
+	sub	r7, r9
 
 	mov	r7, r13
-	shlr	r7
+	add	#-2, r9 /* Input stride compensation for openness */
 
 	mov.l	r12, @-r15
-	movt	r6
+	shlr	r7
 
 	mov.l	r10, @-r15
-	shll	r13
+	movt	r6
 
 	mov.w	_P8_RGB565.palette_distance, r0
+	shll	r13
+
 	add	r6, r7
 
 	sub	r6, r9
@@ -437,23 +445,32 @@ _P8_RGB565.palette_distance:
    Otherwise, they point to the next pixels and the restores are no-ops. See
    the strategy used for managing interweaving in P8 formats for details.
 
-   TODO: Asymptotic performance */
+   The only irregularity is image width, which the command builder cannot
+   modify. It is rounded up to the next multiple of 2, then halved. There is a
+   nice trick for this operation, which is [shlr rX] then adding T to rX. We
+   also need to add -1 for another adjustment, and both are combined into an
+   addc, which saves one add and one movt off the EX critical chain.
+
+   The main loop achieves 5 cycles/pixel. */
 .align 4
 _P4_RGB565A:
-	mov.l	r10, @-r15
 	shlr	r9
+	mov	#-1, r0
+
+	mov.l	r10, @-r15
+	addc	r0, r9
 
 	mov.l	r11, @-r15
-	add	#-1, r9 /* Input stride compensation for openness */
-
-	mov.l	r12, @-r15
-	add	#2, r8 /* image.palette */
-
-	mov.w	@r2+, r11 /* command.edge1 */
 	shlr	r7
 
+	mov.l	r12, @-r15
+	sub	r7, r9
+
+	mov.w	@r2+, r11 /* command.edge1 */
+	add	#2, r8 /* image.palette */
+
 	mov.w	@r2+, r12 /* command.edge2 */
-	mov	r5, r10
+	mov	r5, r0
 
 	mov.l	r13, @-r15
 	shll	r11
@@ -461,62 +478,59 @@ _P4_RGB565A:
 	mov.l	r14, @-r15
 	shll	r12
 
+	add	#-4, r5
+
 	TEX2D_START()
 
-	mov	r10, r0
 	mov.b	@r3+, r6
-
-	/* Stall for r0 */
+	mov	r0, r10
 
 	mov.w	@(r0,r11), r13
 
 	mov.w	@(r0,r12), r14
-
-	/* Main loop with 2 pixels sharing a single byte */
-
-2:	/* Stall for r6 */
-
 	shll	r6
 
-	mov	r6, r0
+	/* Main loop with 2 pixels sharing a single byte */
+2:	mov	r6, r0
 	and	#0x1e, r0
 
 	tst	r0, r0
 
-	bt	4f
-	mov.w	@(r0,r8), r0
-
-	mov.w	r0, @(2,r5)
-     4:	shlr2	r6
-
+	bt.s	4f
 	shlr2	r6
 
+	mov.w	@(r0,r8), r0
+
+	mov.w	r0, @(6,r5)
+     4:	shlr2	r6
+
 	mov	r6, r0
 	and	#0x1e, r0
 
 	tst	r0, r0
+	mov.b	@r3+, r6
+
+	bt.s	5f
+	add	#4, r5
 
-	bt	5f
 	mov.w	@(r0,r8), r0
 
 	mov.w	r0, @r5
-
-     5: mov.b	@r3+, r6
-3:	add	#4, r5
+3:   5: shll	r6
 
 	mov	r10, r0
-	add	r7, r10
+	mov	r7, r10
 
-	/* Stall for r0 */
+	shll2	r10
 
 	mov.w	r13, @(r0,r11)
-	add	r7, r10
-
-	mov.w	r14, @(r0,r12)
 	add	r4, r10
 
-	add	r7, r10
-	add	r7, r10
+	mov.w	r14, @(r0,r12)
+	add	r0, r10
+
+	mov	r10, r0
+	/* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */
 
 	TEX2D_END_NORET()
 	mov.l	@r15+, r14
@@ -529,13 +543,92 @@ _P4_RGB565A:
 	mov.l	@r15+, r8
 
 /* [Rendering strategy for the P4_RGB565 format]
-   Same as P4_RGB565A without transparency checks (fairly straightforward). */
+   Same as P4_RGB565A without transparency checks (fairly straightforward). The
+   core loop runs in 3.5 cycles/pixel. */
 .align 4
 _P4_RGB565:
+	shlr	r9 /* r9 >>= 1, T = old bit 0 (r9: presumably image.width) */
+	mov	#-1, r0
+
+	mov.l	r10, @-r15
+	addc	r0, r9 /* r9 += T - 1: round the halving up, then add -1 */
+
+	mov.l	r11, @-r15
+	shlr	r7 /* Halve r7: each loop iteration handles 2 pixels */
+
+	mov.l	r12, @-r15
+	sub	r7, r9 /* Presumably leaves the per-row input skip; TODO confirm */
+
+	mov.w	@r2+, r11 /* command.edge1 */
+	add	#2, r8 /* image.palette */
+
+	mov.w	@r2+, r12 /* command.edge2 */
+	mov	r5, r0 /* r0 = unbiased r5, base for the edge accesses below */
+
+	mov.l	r13, @-r15
+	shll	r11 /* Double edge1; used as the index in @(r0,r11) */
+
+	mov.l	r14, @-r15
+	shll	r12 /* Double edge2; used as the index in @(r0,r12) */
+
+	add	#-4, r5 /* Bias: loop stores at @(6,r5), adds 4, stores at @r5 */
+	mov	#0x1e, r2 /* Mask used by [and r2, r6]; r2 is free after edge2 */
+
 	TEX2D_START()
-2:
-3:	nop
-	TEX2D_END()
+
+	mov.b	@r3+, r6 /* Preload the first input byte */
+	mov	#-4, r10 /* Negative [shld] count: right shift by 4 */
+
+	mov.l	r0, @-r15 /* Loop uses r0 as scratch; popped after label 3 */
+
+	mov.w	@(r0,r11), r13 /* Back up value at r0+r11; rewritten after loop */
+
+	mov.w	@(r0,r12), r14 /* Back up value at r0+r12; rewritten after loop */
+	shll	r6 /* Pre-double so a masked nibble indexes 16-bit entries */
+
+	/* Main loop with 2 pixels sharing a single byte */
+2:	mov	r6, r0
+	and	#0x1e, r0 /* r0 = 2 * low nibble of the input byte */
+
+	shld	r10, r6 /* Bring the high nibble down (still doubled) */
+
+	mov.w	@(r0,r8), r0 /* Look up the low-nibble entry at r8 + r0 */
+	and	r2, r6 /* r6 = 2 * high nibble */
+
+	mov.w	r0, @(6,r5) /* Low-nibble color goes 2 bytes after the next @r5 */
+	mov	r6, r0
+
+	mov.b	@r3+, r6 /* Fetch the next input byte early */
+	add	#4, r5 /* Advance output by 4 bytes (two 16-bit stores) */
+
+	mov.w	@(r0,r8), r0 /* Look up the high-nibble entry at r8 + r0 */
+
+	mov.w	r0, @r5 /* High-nibble color goes first in the pair */
+3:	shll	r6 /* Pre-double the byte just fetched, as before the loop */
+
+	mov.l	@r15+, r0 /* Recover the r0 saved before the loop */
+	mov	r7, r10
+
+	shll2	r10 /* r10 = 4 * r7 */
+
+	mov.w	r13, @(r0,r11) /* Write back the value saved from r0+r11 */
+	add	r4, r10
+
+	mov.w	r14, @(r0,r12) /* Write back the value saved from r0+r12 */
+	add	r0, r10
+
+	mov	r10, r0 /* r0 += 4 * r7 + r4 (presumably next row; confirm r4) */
+	/* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */
+
+	TEX2D_END_NORET()
+	mov.l	@r15+, r14 /* Pop in reverse order of the pushes above */
+	mov.l	@r15+, r13
+	mov.l	@r15+, r12
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r9 /* r9/r8 not pushed here; presumably by shared entry */
+	rts
+	mov.l	@r15+, r8 /* Executes in the delay slot of [rts] */
 
 /* [Unsupported formats]
    P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c
index 60a20e5..eef2ffa 100644
--- a/azur/src/gint/shaders/tex2d.c
+++ b/azur/src/gint/shaders/tex2d.c
@@ -44,17 +44,17 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     cmd.columns = width;
     cmd.image = image;
 
-    int input_multiplier = 1;
-    void const *data = image->data;
+    int row_stride;
     size_t cmd_size = sizeof cmd - 4;
 
     if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
-        input_multiplier = 0;
-        data += (image->data[0] * 2) + 2;
+        row_stride = image->width;
+        cmd.input = (void *)image->data + (image->data[0] * 2) + 2 +
+            top * row_stride + left;
     }
     else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
-        input_multiplier = -1;
-        data += 32;
+        row_stride = (image->width + 1) >> 1;
+        cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1);
 
         int odd_left  = left & 1;
         int odd_right = (left + width) & 1;
@@ -65,6 +65,10 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
         x -= odd_left;
         cmd_size += 4;
     }
+    else {
+        row_stride = image->width << 1;
+        cmd.input = (void *)image->data + top * row_stride + (left << 1);
+    }
 
     /* This divides by azrp_frag_height */
     cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
@@ -72,9 +76,6 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     while(height > 0) {
         cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
 
-        int input_offset = image->width * top + left;
-        input_offset = (input_offset << (input_multiplier + 1)) >> 1;
-        cmd.input = data + input_offset;
         cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
 
         y += cmd.lines;
@@ -83,6 +84,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
 
         azrp_queue_command(&cmd, cmd_size);
         cmd.fragment_id++;
+        cmd.input += row_stride * cmd.lines;
     }
 
     prof_leave(azrp_perf_cmdgen);