From 0fec6da1c44532ae259d18c893269833f2089a6c Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Fri, 27 Aug 2021 14:23:53 +0200
Subject: [PATCH] azur: progress on tex2d as bopti, custom command sorter

---
 azur/include/azur/gint/render.h |  19 +++--
 azur/src/gint/render.c          |  52 +++++++++---
 azur/src/gint/shaders/clear.c   |  14 ++-
 azur/src/gint/shaders/tex2d.S   | 145 ++++++++++++++++++++++++--------
 azur/src/gint/shaders/tex2d.c   |  52 ++++++++++++
 5 files changed, 218 insertions(+), 64 deletions(-)

diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h
index 527d631..da1c8a9 100644
--- a/azur/include/azur/gint/render.h
+++ b/azur/include/azur/gint/render.h
@@ -113,6 +113,8 @@ extern int azrp_width, azrp_height;
 extern int azrp_frag_count;
 /* Offset of first fragment. */
 extern int azrp_frag_offset;
+/* Height of fragments. */
+extern int azrp_frag_height;
 
 /* azrp_config_scale(): Select the renderer's super-scaling factor
 
@@ -178,7 +180,11 @@ extern uint8_t AZRP_SHADER_TEX2D;
 void azrp_clear(uint16_t color);
 
 /* azrp_image(): Queue image command [AZRP_SHADER_TEX2D] */
-void azrp_image(int x, int y, uint16_t *pixels, int w, int h, int stride);
+void azrp_image(int x, int y, bopti_image_t const *image);
+
+/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_TEX2D] */
+void azrp_subimage(int x, int y, bopti_image_t const *image,
+   int left, int top, int width, int height, int flags);
 
 /* Functions to update uniforms for these shaders. You should call them when:
    * AZRP_SHADER_CLEAR: Changing super-scaling settings.
@@ -254,15 +260,14 @@ struct azrp_shader_tex2d_command {
     uint8_t fragment_id;
     /* Pixels per line */
     int16_t columns;
-    /* Already offset by start row and column */
-    void *input;
+    /* Address of the image structure */
+    bopti_image_t const *image;
     /* Destination in XRAM (offset) */
     uint16_t output;
     /* Number of lines */
     int16_t lines;
-    /* Distance between two lines (columns excluded) */
-    int16_t stride;
-
-} GPACKED(2);
+    /* Already offset by start row and column */
+    void const *input;
+};
 
 AZUR_END_DECLS
diff --git a/azur/src/gint/render.c b/azur/src/gint/render.c
index fd66e28..457ebc9 100644
--- a/azur/src/gint/render.c
+++ b/azur/src/gint/render.c
@@ -17,6 +17,10 @@ int azrp_width, azrp_height;
 /* Offset of first fragment for alignment, and number of fragments. */
 int azrp_frag_offset;
 int azrp_frag_count;
+/* Height of fragment. */
+int azrp_frag_height;
+
+/* TODO: Either make command queue private or use azrp_ prefix */
 
 /* Number and total size of queued commands. */
 GXRAM int commands_count = 0, commands_length = 0;
@@ -48,26 +52,45 @@ void azrp_clear_commands(void)
     commands_length = 0;
 }
 
-static int compare_commands(void const *c1, void const *c2)
+/* Custom quick sort for commands */
+
+static inline int compare(int8_t *c1, int8_t *c2)
 {
-    uint16_t offset1 = *(uint16_t *)c1;
-    uint16_t offset2 = *(uint16_t *)c2;
+    int d = c1[1] - c2[1];
+    return (d ? d : c1 - c2);
+}
 
-    uint8_t *ptr1 = (uint8_t *)(0xe5017000 + offset1);
-    uint8_t *ptr2 = (uint8_t *)(0xe5017000 + offset2);
+static void cmdsort(int low, int high)
+{
+    if(low >= high) return;
 
-    int diff_fragments = (int)ptr1[1] - (int)ptr2[1];
-    if(diff_fragments) return diff_fragments;
+    int8_t *pivot = YRAM + commands_array[(low + high) >> 1];
 
-    return (int)offset1 - (int)offset2;
+    int i = low - 1;
+    int j = high + 1;
+
+    while(1) {
+        do i++;
+        while(compare(YRAM + commands_array[i], pivot) < 0);
+
+        do j--;
+        while(compare(YRAM + commands_array[j], pivot) > 0);
+
+        if(i >= j) break;
+
+        uint16_t tmp = commands_array[i];
+        commands_array[i] = commands_array[j];
+        commands_array[j] = tmp;
+    }
+
+    cmdsort(low, j);
+    cmdsort(j+1, high);
 }
 
 void azrp_sort_commands(void)
 {
     prof_enter(azrp_perf_sort);
-    /* TODO: azrp_sort_commands: Use a custom sorter */
-    qsort(commands_array, commands_count, sizeof commands_array[0],
-        compare_commands);
+    cmdsort(0, commands_count - 1);
     prof_leave(azrp_perf_sort);
 }
 
@@ -95,6 +118,7 @@ void azrp_render_fragments(void)
         }
         else {
             prof_enter(azrp_perf_r61524);
+            /* TODO: Consider xram_frame() by DMA in parallel? */
             xram_frame(azrp_frag, 396 * 8);
             prof_leave(azrp_perf_r61524);
             frag++;
@@ -129,11 +153,11 @@ static void update_frag_count(void)
 static void update_size(void)
 {
     if(azrp_scale == 1)
-        azrp_width = 396, azrp_height = 198;
+        azrp_width = 396, azrp_height = 198, azrp_frag_height = 8;
     else if(azrp_scale == 2)
-        azrp_width = 198, azrp_height = 112;
+        azrp_width = 198, azrp_height = 112, azrp_frag_height = 16;
     else if(azrp_scale == 3)
-        azrp_width = 132, azrp_height = 75;
+        azrp_width = 132, azrp_height = 75,  azrp_frag_height = 16;
 }
 
 void azrp_config_scale(int scale)
diff --git a/azur/src/gint/shaders/clear.c b/azur/src/gint/shaders/clear.c
index 4e0354b..cf1fefa 100644
--- a/azur/src/gint/shaders/clear.c
+++ b/azur/src/gint/shaders/clear.c
@@ -11,15 +11,7 @@ static void register_shader(void)
 
 void azrp_shader_clear_configure(void)
 {
-    int longs_in_fragment = 0;
-
-    if(azrp_scale == 1)
-        longs_in_fragment = (396 * 2) * 8 / 4;
-    else if(azrp_scale == 2)
-        longs_in_fragment = (198 * 2) * 16 / 4;
-    else if(azrp_scale == 3)
-        longs_in_fragment = (132 * 2) * 16 / 4;
-
+    int longs_in_fragment = (azrp_width * azrp_frag_height / 2);
     azrp_set_uniforms(AZRP_SHADER_CLEAR, (void *)longs_in_fragment);
 }
 
@@ -35,6 +27,8 @@ struct command {
 
 void azrp_clear(uint16_t color)
 {
+    prof_enter(azrp_perf_cmdgen);
+
     struct command cmd;
     cmd.shader_id = AZRP_SHADER_CLEAR;
     cmd.color = color;
@@ -43,4 +37,6 @@ void azrp_clear(uint16_t color)
         cmd.fragment_id = i;
         azrp_queue_command(&cmd, sizeof cmd);
     }
+
+    prof_leave(azrp_perf_cmdgen);
 }
diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index 6c65da1..f71a67a 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -1,58 +1,70 @@
 .global _azrp_shader_tex2d
 .align 4
 
+/* Profile values from bopti */
+#define PX_RGB565   0
+#define PX_RGB565A  1
+#define PX_P8       2
+#define PX_P4       3
+
 /* Register assignment
    r0: (temporary)
    r1: Lines
    r2: Output
    r3: Input
-   r4: Output stride (initially uniform: azrp_width*2)
-   r5: Command queue; (temporary)
-   r6: (temporary) (initially azrp_frag)
+   r4: [parameter] azrp_width*2; output stride
+   r5: [parameter] Command queue; (temporary)
+   r6: [parameter] azrp_frag; (temporary)
    r7: Columns
-   r8: Input stride */
+   r8: Input stride
+   r9: Image profile */
 _azrp_shader_tex2d:
+	mov.l	r8, @-r15
 	add	#2, r5
 
-	mov.w	@r5+, r7    /* Columns */
+	mov.l	r9, @-r15
 
-	mov.l	r8, @-r15
+	mov.w	@r5+, r7    /* command.columns */
 
-	mov.w	@r5+, r0    /* Input (1/2) */
+	mov.l	@r5+, r8    /* command.image */
+
+	mov.w	@r5+, r2    /* command.output (offset) */
 	sub	r7, r4
 
-	mov.w	@r5+, r3    /* Input (2/2) */
+	mov.w	@r5+, r1    /* command.lines */
 	sub	r7, r4
 
-	mov.w	@r5+, r2    /* Output offset */
-
-	mov.w	@r5+, r1    /* Lines */
-	shll16	r3
-
-	xtrct	r0, r3
-
-	mov.w	@r5+, r8    /* Input stride */
-	mov	#8, r0      /* Maximum width for naive method */
-
+	mov.w	@r8+, r0    /* image.profile */
 	add	r6, r2
-	cmp/ge	r7, r0
 
-	bt.s	.naive
-	mov	#2, r0
+	mov.w	@r8+, r6    /* image.alpha */
+	cmp/eq	#PX_P4, r0
 
-/* The following variations are named based on the parity of each parameter:
-   * w[eo] (width even, width odd)
-   * d[eo] (data even, data odd)
-   where even/odd means 4-aligned/2-aligned in terms of pointers.
+	mov.w	@r8, r8     /* image.width */
 
-   When the destination and source have identical parity, the copy is pretty
-   direct and takes 2 cycles to copy 4 bytes. When they have opposite parity
-   however, longwords need to be rearranged, which is a problem: arithmetic
-   operations under a RAW dependency take 3 cycles, so there's no way to
-   complete the 4-byte copy in less than 4 cycles unless iterations are opened
-   and weaved, which would add too much sub-cases. So in this case the naive
-   method that copies 4 bytes in 4 cycles is used. A very heavy image renderer
-   like a tileset shader should consider the optimized route though.  */
+	mov.l	@r5+, r3    /* command.input (pointer) */
+
+	sub	r7, r8
+
+	bt.s	.format_P4
+	shll	r8
+
+	cmp/eq	#PX_P8, r0
+
+	bt	.format_P8
+	cmp/eq	#PX_RGB565A, r0
+
+	bt	.format_RGB565A
+
+	/* Default below is .format_RGB565 */
+
+/* [Loop macros]
+
+   The following macros implement the main loop of the image renderer.
+   * Each line is rendered in the tight loop between 2: and 3: (both included).
+   * r2 is the output (with stride r4, in bytes)
+   * r3 is the input (with stride r8, in bytes)
+   * There are r1 rows with r7 iterations each */
 
 #define TEX2D_START()		\
 	ldrs	2f;		\
@@ -66,10 +78,41 @@ _azrp_shader_tex2d:
 	bf.s	1b;		\
 	add	r8, r3;		\
 				\
+	mov.l	@r15+, r9;	\
 	rts;			\
 	mov.l	@r15+, r8
 
-.case_analysis:
+/* [Rendering strategy for the RGB565 format]
+
+   In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
+   optimize by moving longwords. Since longwords are pairs of pixels, there are
+   variations and subcases based on the parity of each parameter:
+
+   * w[eo] denotes whether the width of the image is even or odd;
+   * d[eo] denotes whether the memory accesses to the source and destination
+     are even (4-aligned) or odd (2-aligned).
+
+   When the destination and source have identical parity, the d[eo] variation
+   can be defined. In this case the copy is pretty direct, it's a longword copy
+   and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
+   start or end address is 2-aligned.
+
+   However, when they have opposite parity, each longword read matches up with
+   a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
+   not help because of the stall cycle between loading a register and using it
+   in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
+   the word-based copy). Weaving iterations could help but would be too complex
+   here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
+   tileset shader) should aim for that route though. Also, movua.l followed by
+   mov.l is even slower (5 cycles). */
+
+.format_RGB565:
+	mov	#8, r0      /* Maximum width for naive method */
+	cmp/ge	r7, r0
+
+	bt.s	.naive
+	mov	#2, r0
+
 	/* Use naive method for opposite source/destination parity */
 	mov	r2, r6
 	xor	r3, r6
@@ -131,3 +174,37 @@ _azrp_shader_tex2d:
 2:	movs.w	@r3+, x0
 3:	movs.w	x0, @r2+
 	TEX2D_END()
+
+/* [Rendering strategy for the RGB565A format]
+
+   Since we have to check for the alpha value in each pixel, there's really no
+   longword-based optimization. Instead, we just go as fast as possible with
+   each pixel, using DSP instructions. Branchless jump is pretty useful.
+
+   TODO: Opening iterations will definitely save at least 1 cycle per pixel; it
+         just requires a subcase for extremely small images (width = 1). */
+
+.format_RGB565A:
+	mov	r2, r5
+
+	TEX2D_START()
+	/* In the comparison, DC=1 if x0 == image.alpha */
+2:	                         movs.w  @r3+, x0
+	     pcmp    x0, y0      movx.w  @r5, x1
+	dct  pcopy   x1, x0
+3:	     movx.w  x0, @r5+
+	TEX2D_END()
+
+/* [Rendering strategy for the P8 format] */
+.format_P8:
+	TEX2D_START()
+2:
+3:
+	TEX2D_END()
+
+/* [Rendering strategy for the P4 format] */
+.format_P4:
+	TEX2D_START()
+2:
+3:
+	TEX2D_END()
diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c
index 6d410e0..5f2f9e9 100644
--- a/azur/src/gint/shaders/tex2d.c
+++ b/azur/src/gint/shaders/tex2d.c
@@ -1,4 +1,5 @@
 #include <azur/gint/render.h>
+#include <gint/defs/util.h>
 
 uint8_t AZRP_SHADER_TEX2D = -1;
 
@@ -15,3 +16,54 @@ void azrp_shader_tex2d_configure(void)
 }
 
 //---
+
+/* Profile values from bopti */
+#define PX_RGB565   0
+#define PX_RGB565A  1
+#define PX_P8       2
+#define PX_P4       3
+
+void azrp_image(int x, int y, bopti_image_t const *image)
+{   /* Full image: subimage at (0,0) of size image->width x image->height */
+    azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0);
+}
+
+void azrp_subimage(int x, int y, bopti_image_t const *image,
+    int left, int top, int width, int height, int flags)
+{
+    /* Queue one tex2d command per fragment spanned by rows [y, y+height) */
+    prof_enter(azrp_perf_cmdgen);
+
+    if(!(flags & DIMAGE_NOCLIP)) {
+        /* TODO: tex2d: clip function */
+    }
+
+    struct azrp_shader_tex2d_command cmd;
+    cmd.shader_id = AZRP_SHADER_TEX2D;
+    cmd.columns = width;
+    cmd.image = image;
+
+    /* Pixel index to byte offset: x2 by default, x1 for P8, /2 for P4. P4
+       uses a right shift since a negative left-shift count is undefined. */
+    int profile = image->profile;
+    int input_shift = (profile == PX_P8 || profile == PX_P4) ? 0 : 1;
+
+    /* This divides by azrp_frag_height */
+    cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
+
+    while(height > 0) {
+        /* Never cross the bottom of the current fragment */
+        cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
+        int pixel = image->width * top + left;
+        if(profile == PX_P4) pixel >>= 1;
+        cmd.input = (uint8_t const *)image->data + (pixel << input_shift);
+        cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
+
+        y += cmd.lines;
+        top += cmd.lines;
+        height -= cmd.lines;
+        azrp_queue_command(&cmd, sizeof cmd);
+        cmd.fragment_id++;
+    }
+    prof_leave(azrp_perf_cmdgen);
+}