diff --git a/include/gint/dma.h b/include/gint/dma.h
index bae3909..cc95571 100644
--- a/include/gint/dma.h
+++ b/include/gint/dma.h
@@ -83,16 +83,31 @@ void dma_transfer_noint(int channel, dma_size_t size, uint blocks,
 //---
 
 /* dma_memset(): Fast 32-aligned memset
-   This function is your typical memset, except that the destination and size
+
+   This function is your typical memset(), except that the destination and size
    must be 32-aligned, and that the pattern is 4 bytes instead of one. It is
    replicated to 32 bytes then used to fill the destination area. This 4-byte
    fixed size may be lifted in future versions.
 
+   This function cannot be used with virtualized (P0) addresses.
+
    @dst      Destination address (32-aligned)
    @pattern  4-byte pattern to fill @dst
    @size     Sie of destination area (32-aligned) */
 void *dma_memset(void *dst, uint32_t pattern, size_t size);
 
+/* dma_memcpy(): Fast 32-aligned memcpy
+
+   This function works exactly like memcpy(), but it expects 32-aligned source,
+   destination, and size, and uses the DMA to efficiently copy.
+
+   This function cannot be used with virtualized (P0) addresses.
+
+   @dst   Destination address (32-aligned)
+   @src   Source address (32-aligned)
+   @size  Size of region (32-aligned) */
+void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size);
+
 #endif /* FXCG50 */
 
 #endif /* GINT_DMA */
diff --git a/src/dma/memcpy.c b/src/dma/memcpy.c
new file mode 100644
index 0000000..ded4a46
--- /dev/null
+++ b/src/dma/memcpy.c
@@ -0,0 +1,10 @@
+#include <gint/dma.h>
+
+/* dma_memcpy(): Fast 32-aligned memcpy (size >> 5 blocks of 32 bytes) */
+void *dma_memcpy(void * restrict dst, const void * restrict src, size_t size)
+{
+	dma_transfer(1, DMA_32B, size >> 5, src, DMA_INC, dst, DMA_INC);
+	dma_transfer_wait(1);
+
+	return dst;
+}
diff --git a/src/dma/memset.c b/src/dma/memset.c
index a903e6d..b2f7016 100644
--- a/src/dma/memset.c
+++ b/src/dma/memset.c
@@ -1,13 +1,20 @@
 #include <gint/dma.h>
 
+/* 32-byte, 32-aligned ILRAM buffer holding the replicated fill pattern */
+GALIGNED(32) GILRAM static uint32_t ILbuf[8];
+
 /* dma_memset(): Fast 32-aligned memset */
 void *dma_memset(void *dst, uint32_t l, size_t size)
 {
-	/* TODO: Use a proper IL memory allocation scheme */
-	uint32_t *IL = (void *)0xe5200000;
-	for(int i = 0; i < 8; i++) IL[i] = l;
+	/* Prepare the ILRAM buffer. We need to use ILRAM because the DMA will
+	   have to read the operand once per block, as opposed to an assembler
+	   routine that would hold it in a register. If we place it in RAM, the
+	   DMA will perform twice as many RAM accesses as the handwritten
+	   assembler, which would be very slow. By using ILRAM we use two
+	   different memory regions, making the DMA faster than the CPU. */
+	for(int i = 0; i < 8; i++) ILbuf[i] = l;
 
-	dma_transfer(1, DMA_32B, size >> 5, IL, DMA_FIXED, dst, DMA_INC);
+	dma_transfer(1, DMA_32B, size >> 5, ILbuf, DMA_FIXED, dst, DMA_INC);
 	dma_transfer_wait(1);
 	return dst;
 }