diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h
index 98adf09..ca58823 100644
--- a/azur/include/azur/gint/render.h
+++ b/azur/include/azur/gint/render.h
@@ -16,9 +16,9 @@
 // fragment shaders.
 //
 // The command queue stores all rendering commands, split into fragments. Each
-// fragment needs to read through all commands to produce its output, and
-// because fragments are rendered in order, the sequence of commands must be
-// read several times, therefore stored.
+// fragment needs to read through a number of commands, and the order does not
+// match the order of API calls because each API call typically impacts several
+// fragments. Therefore commands need to be stored.
 //
 // Fragment shaders are the programs that render commands into graphics data
 // for each fragments. They are pretty similar to OpenGL shaders, in that they
@@ -115,26 +115,24 @@ void azrp_image(int x, int y, uint16_t *pixels, int w, int h, int stride);
 // use them, so they are safe to write to and reset when they're not running.
 //---
 
-/* This counter runs during command generation and enqueue operations, usually
-   between azrp_begin_frame() and azrp_render_frame(). */
+/* This counter runs during command generation and queue operations. */
 extern prof_t azrp_perf_cmdgen;
 
-/* This counter runs during the command sorting step, which occurs at the start
-   of azrp_render_frame(). */
+/* This counter runs during the command sorting step. */
 extern prof_t azrp_perf_sort;
 
-/* This counter runs during shader executions in arzp_render_frame(). */
+/* This counter runs during shader executions in azrp_render_fragments(). */
 extern prof_t azrp_perf_shaders;
 
 /* This counter runs during CPU transfers to the R61524 display. */
 extern prof_t azrp_perf_r61524;
 
-/* This counter runs during the whole azrp_frame_render() operation; it is the
-   sum of sort, shaders, r61524, plus some logic overhead. */
+/* This counter runs during the whole azrp_update() operation; it is the sum of
+   sort, shaders, r61524, plus some logic overhead. */
 extern prof_t azrp_perf_render;
 
 /* azrp_perf_clear(): Clear all performance counters
-   Generally you want to do this before azrp_frame_begin(). */
+   Generally you want to do this before azrp_update(). */
 void azrp_perf_clear(void);
 
 //---
@@ -172,12 +170,13 @@ struct azrp_shader_tex2d_command {
     int16_t columns;
     /* Already offset by start row and column */
     void *input;
-    /* Destination in XRAM */
-    void *output;
+    /* Destination in XRAM (offset) */
+    uint16_t output;
     /* Number of lines */
     int16_t lines;
     /* Distance between two lines (columns excluded) */
     int16_t stride;
-};
+
+} GPACKED(2);
 
 AZUR_END_DECLS
diff --git a/azur/src/gint/render.c b/azur/src/gint/render.c
index 9bd292f..1ff5212 100644
--- a/azur/src/gint/render.c
+++ b/azur/src/gint/render.c
@@ -132,8 +132,13 @@ bool azrp_queue_command(void *command, size_t size)
     if(commands_length + size >= 8192)
         return false;
 
+    uint8_t *dst = YRAM + commands_length;
+    uint8_t *src = command;
+
+    for(size_t i = 0; i < size; i++)
+        dst[i] = src[i];
+
     commands_array[commands_count++] = commands_length;
-    memcpy(YRAM + commands_length, command, size);
     commands_length += size;
 
     return true;
diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index ad7804e..ea014a3 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -1,60 +1,143 @@
 .global _azrp_shader_tex2d
 .align 4
 
+/* TODO [scaling]: Pass the _792 constant and fragment address as uniform */
+
 /* Register assignment
+   r0: (temporary)
    r1: Lines
    r2: Columns
    r3: Input
    r4: Output
-   r5: Command queue
-   r7: Constant 396*2 = 0x318
-   r8: Output stride
-   r9: Input stride */
+   r5: Command queue; (temporary)
+   r6: (temporary)
+   r7: Output stride
+   r8: Input stride */
 _azrp_shader_tex2d:
-	mov.l	r8, @-r15
+	mov.w	_792, r7
 	add	#2, r5
 
-	mov.l	r9, @-r15
-	mov	#0x03, r7
+	mov.w	@r5+, r2    /* Columns */
 
-	ldrs	1f
-	shll8	r7
+	mov.l	r8, @-r15
 
-	ldre	2f
-	add	#0x18, r7
+	mov.w	@r5+, r6    /* Input (1/2) */
+	sub	r2, r7
 
-	/* CHECK: 4-alignment here */
+	mov.w	@r5+, r3    /* Input (2/2) */
+	sub	r2, r7
 
-.texture:
-	mov.w	@r5+, r2 /* Columns */
-	mov	r7, r8
+	mov.w	@r5+, r4    /* Output offset */
 
-	mov.l	@r5+, r3 /* Input */
+	mov.w	@r5+, r1    /* Lines */
+	shll16	r3
 
-	mov	r2, r0
-	mov.l	@r5+, r4 /* Output */
+	xtrct	r6, r3
+	mov.l	.fragment, r6
 
-	shll	r0
-	mov.w	@r5+, r1 /* Lines */
+	mov.w	@r5+, r8    /* Input stride */
+	mov	#8, r0      /* Maximum width for naive method */
 
-	sub	r0, r8
-	mov.w	@r5+, r9 /* Input stride */
+	add	r6, r4
+	cmp/ge	r2, r0
+
+	bt.s	.naive
+	mov	#2, r0
+
+/* The following variations are named based on the parity of each parameter:
+   * w[eo] (width even, width odd)
+   * d[eo] (data even, data odd)
+   where even/odd means 4-aligned/2-aligned in terms of pointers.
+
+   When the destination and source have identical parity, the copy is pretty
+   direct and takes 2 cycles to copy 4 bytes. When they have opposite parity
+   however, longwords need to be rearranged, which is a problem: arithmetic
+   operations under a RAW dependency take 3 cycles, so there's no way to
+   complete the 4-byte copy in less than 4 cycles unless iterations are opened
+   and weaved, which would add too many sub-cases. So in this case the naive
+   method that copies 4 bytes in 4 cycles is used. A very heavy image renderer
+   like a tileset shader should consider the optimized route though. */
+
+#define TEX2D_START()		/* Per-line prologue; every user supplies local labels 2: and 3: */ \
+	ldrs	2f;		/* Repeat-start address <- next local label 2 */ \
+	ldre	3f;		/* Repeat-end address <- next local label 3 */ \
+				\
+1:	ldrc	r2;		/* Line entry point: repeat count <- r2 */ \
+	dt	r1;		/* Decrement lines left (r1); T is set when it reaches 0 */ \
+
+#define TEX2D_END()		/* Per-line epilogue: advance pointers, loop on lines, return */ \
+	add	r7, r4;		/* Output (r4) += output stride (r7) */ \
+	bf.s	1b;		/* Next line while T is clear (T comes from dt r1) */ \
+	add	r8, r3;		/* Delay slot: input (r3) += input stride (r8) */ \
+				\
+	rts;			\
+	mov.l	@r15+, r8	/* Delay slot of rts: restore the r8 saved on entry */
+
+.case_analysis:
+	/* Use naive method for opposite source/destination parity */
+	mov	r4, r6
+	xor	r3, r6
+	tst	r0, r6
+	bf	.naive
 
 	shlr	r2
+	bt	.wo
 
-.line:
-	ldrc	r2
-	dt	r1
+.we:
+	tst	r0, r4
+	bf	.we_do
 
-1:	movs.l	@r3+, x0
-2:	movs.l	x0, @r4+
+.we_de:
+	TEX2D_START()
+2:	movs.l	@r3+, x0
+3:	movs.l	x0, @r4+
+	TEX2D_END()
 
-	add	r8, r4
+.we_do:
+	add	#-1, r2
 
-	bf.s	.line
-	add	r9, r3
+	TEX2D_START()
+	movs.w	@r3+, x0
+	movs.w	x0, @r4+
 
-.end:
-	mov.l	@r15+, r9
-	rts
-	mov.l	@r15+, r8
+2:	movs.l	@r3+, x0
+3:	movs.l	x0, @r4+
+
+	movs.w	@r3+, x0
+	movs.w	x0, @r4+
+	TEX2D_END()
+
+.wo:
+	tst	r0, r4
+	bf	.wo_do
+
+.wo_de:
+	TEX2D_START()
+2:	movs.l	@r3+, x0
+3:	movs.l	x0, @r4+
+
+	movs.w	@r3+, x0
+	movs.w	x0, @r4+
+	TEX2D_END()
+
+.wo_do:
+	TEX2D_START()
+	movs.w	@r3+, x0
+	movs.w	x0, @r4+
+
+2:	movs.l	@r3+, x0
+3:	movs.l	x0, @r4+
+	TEX2D_END()
+
+/* Naive method for small widths and opposite source/destination parity */
+.naive:
+	TEX2D_START()
+2:	movs.w	@r3+, x0
+3:	movs.w	x0, @r4+
+	TEX2D_END()
+
+.align 4
+.fragment:
+	.long _azrp_frag
+_792:
+	.word	792