diff --git a/include/gint/dma.h b/include/gint/dma.h index bae3909..cc95571 100644 --- a/include/gint/dma.h +++ b/include/gint/dma.h @@ -83,16 +83,31 @@ void dma_transfer_noint(int channel, dma_size_t size, uint blocks, //--- /* dma_memset(): Fast 32-aligned memset - This function is your typical memset, except that the destination and size + + This function is your typical memset(), except that the destination and size must be 32-aligned, and that the pattern is 4 bytes instead of one. It is replicated to 32 bytes then used to fill the destination area. This 4-byte fixed size may be lifted in future versions. + This function cannot be used with virtualized (P0) addresses. + @dst Destination address (32-aligned) @pattern 4-byte pattern to fill @dst @size Sie of destination area (32-aligned) */ void *dma_memset(void *dst, uint32_t pattern, size_t size); +/* dma_memcpy(): Fast 32-aligned memcpy + + This function works exactly like memcpy(), but it expects 32-aligned source, + destination, and size, and uses the DMA to efficiently copy. + + This function cannot be used with virtualized (P0) addresses. 
+ + @dst Destination address (32-aligned) + @src Source address (32-aligned) + @size Size of region (32-aligned) */ +void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size); + #endif /* FXCG50 */ #endif /* GINT_DMA */ diff --git a/src/dma/memcpy.c b/src/dma/memcpy.c new file mode 100644 index 0000000..ded4a46 --- /dev/null +++ b/src/dma/memcpy.c @@ -0,0 +1,10 @@ +#include <gint/dma.h> + +/* dma_memcpy(): Fast 32-aligned memcpy */ +void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size) +{ + dma_transfer(1, DMA_32B, size >> 5, src, DMA_INC, dst, DMA_INC); + dma_transfer_wait(1); + + return dst; +} diff --git a/src/dma/memset.c b/src/dma/memset.c index a903e6d..b2f7016 100644 --- a/src/dma/memset.c +++ b/src/dma/memset.c @@ -1,13 +1,20 @@ #include <gint/dma.h> +/* Allocate a 32-byte buffer in ILRAM */ +GALIGNED(32) GILRAM static uint32_t ILbuf[8]; + /* dma_memset(): Fast 32-aligned memset */ void *dma_memset(void *dst, uint32_t l, size_t size) { - /* TODO: Use a proper IL memory allocation scheme */ - uint32_t *IL = (void *)0xe5200000; - for(int i = 0; i < 8; i++) IL[i] = l; + /* Prepare the ILRAM buffer. We need to use ILRAM because the DMA will + have to read the operand once per block, as opposed to an assembler + routine that would hold it in a register. If we place it in RAM, the + DMA will perform twice as many RAM accesses as the handwritten + assembler, which would be very slow. By using ILRAM we use two + different memory regions, making the DMA faster than the CPU. */ + for(int i = 0; i < 8; i++) ILbuf[i] = l; - dma_transfer(1, DMA_32B, size >> 5, IL, DMA_FIXED, dst, DMA_INC); + dma_transfer(1, DMA_32B, size >> 5, ILbuf, DMA_FIXED, dst, DMA_INC); dma_transfer_wait(1); return dst; }