dma: finalize dma_memset() and dma_memcpy()

Adds support for dma_memcpy(), and uses a proper ILRAM allocation scheme (static linking here) for the temporary buffer in dma_memset().
support data loading in ILRAM, XRAM and YRAM
2019-09-15 19:30:57 +02:00 · 2019-09-15 19:29:47 +02:00 · 2019-09-15 15:20:23 +02:00
8 changed files with 169 additions and 31 deletions
--- a/fx9860g.ld
+++ b/fx9860g.ld
@ -15,19 +15,24 @@ MEMORY
 {
 	/* Userspace mapping of the add-in (0x200 B are for the G1A header).
 	   220k is the maximum amount of simultaneously-mappable code */
-	rom  (rx):	o = 0x00300200, l = 220k
+	rom   (rx):   o = 0x00300200, l = 220k
 	/* This is mapped to RAM; 8k on SH3, apparently 32k on SH4 */
-	ram  (rw):	o = 0x08100000, l = 8k
+	ram   (rw):   o = 0x08100000, l = 8k
 	/* gint's VBR space, mentioned here for completeness */
-	vbr  (rwx):	o = 0x8800e000, l = 5k
+	vbr   (rwx):  o = 0x8800e000, l = 5k
 	/* Some RAM region from P1 area; gint's data will reside here */
-	rram (rwx):	o = 0x8800f400, l = 3k
+	rram  (rwx):  o = 0x8800f400, l = 3k
+	/* On-chip IL memory */
+	ilram (rwx):  o = 0xe5200000, l = 4k
+	/* On-chip X and Y memory */
+	xram  (rwx):  o = 0xe5007000, l = 8k
+	yram  (rwx):  o = 0xe5017000, l = 8k
 }

 SECTIONS
 {
 	/*
-	**	ROM sections
+	**  ROM sections
 	*/

 	/* First address to be mapped to ROM (including G1A header) */
@ -94,13 +99,16 @@ SECTIONS
 	   - Resources or assets from fxconv or similar converters
 	   - Data marked read-only by the compiler (.rodata and .rodata.*) */
 	.rodata : SUBALIGN(4) {
+		/* Put these first, they need to be 4-aligned */
+		*(.rodata.4)
+
 		*(.rodata .rodata.*)
 	} > rom



 	/*
-	**	RAM sections
+	**  RAM sections
 	*/

 	. = ORIGIN(ram);
@ -142,12 +150,48 @@ SECTIONS

 	_sdata = SIZEOF(.data) + SIZEOF(.data.4);

+	/* On-chip memory sections: IL, X and Y memory */
+
+	. = ORIGIN(ilram);
+	.ilram ALIGN(4) : ALIGN(4) {
+		_lilram = LOADADDR(.ilram);
+		_rilram = . ;
+
+		*(.ilram)
+
+		. = ALIGN(16);
+	} > ilram AT> rom
+
+	. = ORIGIN(xram);
+	.xram ALIGN(4) : ALIGN(4) {
+		_lxram = LOADADDR(.xram);
+		_rxram = . ;
+
+		*(.xram)
+
+		. = ALIGN(16);
+	} > xram AT> rom
+
+	. = ORIGIN(yram);
+	.yram ALIGN(4) : ALIGN(4) {
+		_lyram = LOADADDR(.yram);
+		_ryram = . ;
+
+		*(.yram)
+
+		. = ALIGN(16);
+	} > yram AT> rom
+
+	_silram = SIZEOF(.ilram);
+	_sxram  = SIZEOF(.xram);
+	_syram  = SIZEOF(.yram);
+


 	/*
-	**	RRAM sections
-	**	8800e000:4k  VBR space
-	**	8800f000:4k  .gint.data and .gint.bss
+	**  RRAM sections
+	**  8800e000:4k  VBR space
+	**  8800f000:4k  .gint.data and .gint.bss
 	*/

 	/* VBR address: let's just start at the beginning of the RRAM area.
@ -185,7 +229,7 @@ SECTIONS


 	/*
-	**	Other sections
+	**  Other sections
 	*/

 	/* Unwanted sections going to meet Dave Null:
--- a/fxcg50.ld
+++ b/fxcg50.ld
@ -13,20 +13,25 @@ ENTRY(_start)
 MEMORY
 {
 	/* Userspace mapping of the add-in (without G3A header) */
-	rom  (rx):	o = 0x00300000, l = 220k
+	rom   (rx):   o = 0x00300000, l = 220k
 	/* Static RAM; stack grows down from the end of this region.
 	   The first 0x2000 bytes are reserved by gint, see below */
-	ram  (rw):	o = 0x08102000, l = 512k
+	ram   (rw):   o = 0x08102000, l = 512k
 	/* gint's VBR space, mentioned here for completeness */
-	vbr  (rwx):	o = 0x8c160000, l = 5k
+	vbr   (rwx):  o = 0x8c160000, l = 5k
 	/* Some RAM region from P1 area; gint's data will reside here */
-	rram (rwx):	o = 0x8c161400, l = 3k
+	rram  (rwx):  o = 0x8c161400, l = 3k
+	/* On-chip IL memory */
+	ilram (rwx):  o = 0xe5200000, l = 4k
+	/* On-chip X and Y memory */
+	xram  (rwx):  o = 0xe5007000, l = 8k
+	yram  (rwx):  o = 0xe5017000, l = 8k
 }

 SECTIONS
 {
 	/*
-	**	ROM sections
+	**  ROM sections
 	*/

 	/* First address to be mapped to ROM */
@ -87,7 +92,7 @@ SECTIONS
 	   - Data marked read-only by the compiler (.rodata and .rodata.*) */
 	.rodata : SUBALIGN(4) {
 		/* Put these first, they need to be 4-aligned */
-		*(.rodata.assets)
+		*(.rodata.4)

 		*(.rodata .rodata.*)
 	} > rom
@ -95,7 +100,7 @@ SECTIONS


 	/*
-	**	RAM sections
+	**  RAM sections
 	*/

 	. = ORIGIN(ram);
@ -130,12 +135,48 @@ SECTIONS

 	_sdata = SIZEOF(.data) + SIZEOF(.data.4);

+	/* On-chip memory sections: IL, X and Y memory */
+
+	. = ORIGIN(ilram);
+	.ilram ALIGN(4) : ALIGN(4) {
+		_lilram = LOADADDR(.ilram);
+		_rilram = . ;
+
+		*(.ilram)
+
+		. = ALIGN(16);
+	} > ilram AT> rom
+
+	. = ORIGIN(xram);
+	.xram ALIGN(4) : ALIGN(4) {
+		_lxram = LOADADDR(.xram);
+		_rxram = . ;
+
+		*(.xram)
+
+		. = ALIGN(16);
+	} > xram AT> rom
+
+	. = ORIGIN(yram);
+	.yram ALIGN(4) : ALIGN(4) {
+		_lyram = LOADADDR(.yram);
+		_ryram = . ;
+
+		*(.yram)
+
+		. = ALIGN(16);
+	} > yram AT> rom
+
+	_silram = SIZEOF(.ilram);
+	_sxram  = SIZEOF(.xram);
+	_syram  = SIZEOF(.yram);
+


 	/*
-	**	gint-related sections
-	**	8c160000:4k  VBR space
-	**	8c161000:4k  .gint.data and .gint.bss
+	**  gint-related sections
+	**  8c160000:4k  VBR space
+	**  8c161000:4k  .gint.data and .gint.bss
 	*/

 	/* VBR address: let's just start at the beginning of the RAM area.
@ -173,7 +214,7 @@ SECTIONS


 	/*
-	**	Other sections
+	**  Other sections
 	*/

 	/* Unwanted sections going to meet Dave Null:
--- a/include/gint/defs/attributes.h
+++ b/include/gint/defs/attributes.h
@ -13,6 +13,10 @@
 /* Additional sections that are only needed on SH3 */
 #define GDATA3		__attribute__((section(".gint.data.sh3")))
 #define GBSS3		__attribute__((section(".gint.bss.sh3")))
+/* Objects for the ILRAM, XRAM and YRAM regions */
+#define GILRAM		__attribute__((section(".ilram")))
+#define GXRAM		__attribute__((section(".xram")))
+#define GYRAM		__attribute__((section(".yram")))

 /* Unused parameters or variables */
 #define GUNUSED		__attribute__((unused))
--- a/include/gint/dma.h
+++ b/include/gint/dma.h
@ -57,7 +57,7 @@ void dma_transfer(int channel, dma_size_t size, uint length,
 	void const *src, dma_address_t src_mode,
 	void *dst, dma_address_t dst_mode);

-/* dma_transfer_wait() - Wait for a transfer on channel 0 to finish
+/* dma_transfer_wait() - Wait for a transfer to finish

   You should call this function when you need the transfer to be complete
   before continuing execution. If you are sure that the transfer is finished,
@ -83,16 +83,31 @@ void dma_transfer_noint(int channel, dma_size_t size, uint blocks,
 //---

 /* dma_memset(): Fast 32-aligned memset
-   This function is your typical memset, except that the destination and size
+
+   This function is your typical memset(), except that the destination and size
   must be 32-aligned, and that the pattern is 4 bytes instead of one. It is
   replicated to 32 bytes then used to fill the destination area. This 4-byte
   fixed size may be lifted in future versions.

+   This function cannot be used with virtualized (P0) addresses.
+
   @dst      Destination address (32-aligned)
   @pattern  4-byte pattern to fill @dst
   @size     Size of destination area (32-aligned) */
 void *dma_memset(void *dst, uint32_t pattern, size_t size);

+/* dma_memcpy(): Fast 32-aligned memcpy
+
+   This function works exactly like memcpy(), but it expects 32-aligned source,
+   destination, and size, and uses the DMA to efficiently copy.
+
+   This function cannot be used with virtualized (P0) addresses.
+
+   @dst   Destination address (32-aligned)
+   @src   Source address (32-aligned)
+   @size  Size of region (32-aligned) */
+void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size);
+
 #endif /* FXCG50 */

 #endif /* GINT_DMA */
--- a/src/core/start.c
+++ b/src/core/start.c
@ -20,6 +20,9 @@ extern uint32_t
 	brom, srom,			/* Limits of ROM mappings */
 	lgdata, sgdata, rgdata,		/* gint's data section */
 	ldata,  sdata,  rdata,		/* User's data section */
+	lilram, silram, rilram,		/* IL memory section */
+	lxram,  sxram,  rxram,		/* X memory section */
+	lyram,  syram,  ryram,		/* Y memory section */
 	sbss, rbss,			/* User's BSS section */
 	btors, mtors, etors;		/* Constructor/destructor arrays */
 extern gint_driver_t
@ -120,7 +123,10 @@ int start(int isappli, int optnum)
 	/* Load data sections and wipe the bss section. This has to be done
 	   first for static and global variables to be initialized */
 	regcpy(lgdata, sgdata, rgdata);
-	regcpy(ldata, sdata, rdata);
+	regcpy(ldata,  sdata,  rdata);
+	regcpy(lilram, silram, rilram);
+	regcpy(lxram,  sxram,  rxram);
+	regcpy(lyram,  syram,  ryram);
 	regclr(rbss, sbss);
 	bootlog_loaded();

--- a/src/dma/dma.c
+++ b/src/dma/dma.c
@ -41,7 +41,7 @@ static uint32_t dma_translate(void const *address)
 		return a;

 	/* First additional on-chip memory area (XRAM) */
-	if(a >= 0xe5007000 && a < 0xE5009000)
+	if(a >= 0xe5007000 && a < 0xe5009000)
 		return a;

 	/* Second on-chip memory area (YRAM) */
@ -123,8 +123,18 @@ void dma_transfer_wait(int channel)
 	channel_t *ch = dma_channel(channel);
 	if(!ch) return;

-	/* Wait for the channel to be disabled by the interrupt handler */
-	while(ch->CHCR.DE) sleep();
+	/* Wait for the channel to be disabled by the interrupt handler.
+	   When the source or the destination of the transfer is X, Y or IL
+	   memory, refrain from sleeping as this also stops the transfer. */
+	int onchip = 0;
+
+	if(ch->SAR >= 0xe5007000 && ch->SAR < 0xe5204000) onchip = 1;
+	if(ch->DAR >= 0xe5007000 && ch->DAR < 0xe5204000) onchip = 1;
+
+	while(ch->CHCR.DE)
+	{
+		if(!onchip) sleep();
+	}
 }

 /* dma_transfer_noint(): Perform a data transfer without interruptions */
--- a/src/dma/memcpy.c
+++ b/src/dma/memcpy.c
@ -0,0 +1,10 @@
+#include <gint/dma.h>
+
+/* dma_memcpy(): Fast 32-aligned memcpy (size >> 5 blocks on DMA channel 1) */
+void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size)
+{
+	dma_transfer(1, DMA_32B, size >> 5, src, DMA_INC, dst, DMA_INC);
+	dma_transfer_wait(1);
+	/* The transfer was waited for above; hand back dst like memcpy() */
+	return dst;
+}
--- a/src/dma/memset.c
+++ b/src/dma/memset.c
@ -1,12 +1,20 @@
 #include <gint/dma.h>

+/* 32-byte buffer placed in ILRAM by GILRAM; holds the replicated pattern */
+GALIGNED(32) GILRAM static uint32_t ILbuf[8];
+
 /* dma_memset(): Fast 32-aligned memset */
 void *dma_memset(void *dst, uint32_t l, size_t size)
 {
-	/* TODO: Use a proper IL memory allocation scheme */
-	uint32_t *IL = (void *)0xe5200000;
-	for(int i = 0; i < 8; i++) IL[i] = l;
+	/* Replicate the 4-byte pattern into the 32-byte ILRAM buffer. The DMA
+	   has to read its source operand once per block, as opposed to an
+	   assembler routine that would hold it in a register. If we place it
+	   in RAM, the DMA will perform twice as many RAM accesses as the
+	   handwritten assembler, which would be very slow. By using ILRAM we
+	   use two different memory regions, making the DMA faster than the CPU. */
+	for(int i = 0; i < 8; i++) ILbuf[i] = l;

-	dma_transfer_noint(1, DMA_32B, size >> 5, IL, DMA_FIXED, dst, DMA_INC);
+	dma_transfer(1, DMA_32B, size >> 5, ILbuf, DMA_FIXED, dst, DMA_INC);
+	dma_transfer_wait(1);
 	return dst;
 }