perf/memory: add alternating reads and writes

This commit is contained in:
Lephenixnoir 2022-04-03 10:36:35 +01:00
parent 32a81d6e54
commit 71d7b2fcf2
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
2 changed files with 200 additions and 68 deletions

View File

@ -1,7 +1,7 @@
/* Useful macros to get a repeating DSP loop for instances with more than
4096 iterations. Uses r1 and r2 (and resets r0 to 0), labels 3: and 4:. */
#define PRELUDE(ITERATIONS) \
mov.w .w4095, r1; \
mov.l .l4095, r1; \
mov #0, r0; \
3: ldrs 1f; \
cmp/hi r1, ITERATIONS; \
@ -70,6 +70,69 @@ _mem_write32:
1: 2: mov.l r0, @-r4
EPILOGUE(r5)
/* Alternate reads from CPU */
.global _mem_read8_alt
.global _mem_read16_alt
.global _mem_read32_alt
/* _mem_read8_alt: alternating 8-bit reads from the two addresses held in r4
   and r5 (mem1 and mem2 of the C prototype, assuming the usual SuperH
   argument registers r4/r5/r6 -- confirm). r6 is the total byte count.
   Neither address register is modified, so every pass reads the same two
   locations. */
_mem_read8_alt:
/* Two 1-byte reads per pass, so the pass count is r6 >> 1 */
shlr r6
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body. PRELUDE refers
   to 1f; 2: is presumably the loop end used by the macros (not visible). */
1: mov.b @r4, r0
2: mov.b @r5, r0
EPILOGUE(r6)
/* _mem_read16_alt: alternating 16-bit reads from the two addresses held in
   r4 and r5 (presumably mem1/mem2 of the C prototype -- confirm). r6 is the
   total byte count. Neither address register is modified. */
_mem_read16_alt:
/* Two 2-byte reads per pass, so the pass count is r6 >> 2 */
shlr2 r6
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body */
1: mov.w @r4, r0
2: mov.w @r5, r0
EPILOGUE(r6)
/* _mem_read32_alt: alternating 32-bit reads from the two addresses held in
   r4 and r5 (presumably mem1/mem2 of the C prototype -- confirm). r6 is the
   total byte count. Neither address register is modified. */
_mem_read32_alt:
/* Two 4-byte reads per pass, so the pass count is r6 >> 3 (1 + 2 bits) */
shlr r6
shlr2 r6
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body */
1: mov.l @r4, r0
2: mov.l @r5, r0
EPILOGUE(r6)
/* Alternate writes from CPU */
.global _mem_write8_alt
.global _mem_write16_alt
.global _mem_write32_alt
/* _mem_write8_alt: alternating 8-bit writes to the two addresses held in r4
   and r5 (presumably mem1/mem2 of the C prototype -- confirm). r6 is the
   total byte count. Neither address register is modified, so every pass
   writes the same two locations. */
_mem_write8_alt:
/* Fetch the bytes currently stored at both addresses */
mov.b @r4, r0
mov.b @r5, r3
/* Two 1-byte writes per pass, so the pass count is r6 >> 1 */
shlr r6
/* NOTE(review): this discards the byte just loaded from @r4, so the loop
   stores 0 to @r4 while @r5 gets its original byte back from r3. The 16-
   and 32-bit variants have no such instruction here -- confirm intent. */
mov #0, r0
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body */
1: mov.b r0, @r4
2: mov.b r3, @r5
EPILOGUE(r6)
/* _mem_write16_alt: alternating 16-bit writes to the two addresses held in
   r4 and r5 (presumably mem1/mem2 of the C prototype -- confirm). r6 is the
   total byte count. Neither address register is modified. */
_mem_write16_alt:
/* Fetch the words currently stored at both addresses.
   NOTE(review): the visible part of PRELUDE executes `mov #0, r0`, which
   would discard the r0 load before the loop starts -- confirm. */
mov.w @r4, r0
mov.w @r5, r3
/* Two 2-byte writes per pass, so the pass count is r6 >> 2 */
shlr2 r6
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body */
1: mov.w r0, @r4
2: mov.w r3, @r5
EPILOGUE(r6)
/* _mem_write32_alt: alternating 32-bit writes to the two addresses held in
   r4 and r5 (presumably mem1/mem2 of the C prototype -- confirm). r6 is the
   total byte count. Neither address register is modified. */
_mem_write32_alt:
/* Fetch the longwords currently stored at both addresses.
   NOTE(review): the visible part of PRELUDE executes `mov #0, r0`, which
   would discard the r0 load before the loop starts -- confirm. */
mov.l @r4, r0
mov.l @r5, r3
/* Two 4-byte writes per pass, so the pass count is r6 >> 3 (1 + 2 bits) */
shlr r6
shlr2 r6
PRELUDE(r6)
/* 1: and 2: are the two instructions of the repeated body */
1: mov.l r0, @r4
2: mov.l r3, @r5
EPILOGUE(r6)
/* Memory reads and writes from DSP XRAM */
.global _mem_dspx_read16
@ -140,4 +203,6 @@ _mem_dsps_write32:
1: 2: movs.l x0, @r4+
EPILOGUE(r5)
.w4095: .word 4095
.align 4
.l4095:
.long 4095

View File

@ -22,10 +22,18 @@
extern void mem_read8 (void *mem, int size);
extern void mem_read16 (void *mem, int size);
extern void mem_read32 (void *mem, int size);
/* Right asm writes of different sizes. SPU2 memory only supports 32-bit */
/* Tight asm writes of different sizes. SPU2 memory only supports 32-bit */
extern void mem_write8 (void *mem, int size);
extern void mem_write16 (void *mem, int size);
extern void mem_write32 (void *mem, int size);
/* Tight asm reads alternating between 2 addresses. size is the total number
   of bytes read over both addresses combined (the loop runs
   size / (2 * access width) times). The pointers are not incremented: every
   pass reads the same location at mem1, then the same location at mem2. */
extern void mem_read8_alt (void *mem1, void *mem2, int size);
extern void mem_read16_alt (void *mem1, void *mem2, int size);
extern void mem_read32_alt (void *mem1, void *mem2, int size);
/* Tight asm writes alternating between 2 addresses. size is the total number
   of bytes written over both addresses combined (the loop runs
   size / (2 * access width) times). The pointers are not incremented: every
   pass writes the same location at mem1, then the same location at mem2. */
extern void mem_write8_alt (void *mem1, void *mem2, int size);
extern void mem_write16_alt (void *mem1, void *mem2, int size);
extern void mem_write32_alt (void *mem1, void *mem2, int size);
/* Same using the DSP's XRAM addressing instructions (movx) */
extern void mem_dspx_read16 (void *mem, int size);
extern void mem_dspx_read32 (void *mem, int size);
@ -60,7 +68,7 @@ extern void *dma_memcpy (void *dst, void const *src, size_t size);
GILRAM GALIGNED(32) static char ilram_buffer[0x800];
GXRAM GALIGNED(32) static char xram_buffer[0x800];
// GYRAM GALIGNED(32) static char yram_buffer[0x800];
GYRAM GALIGNED(32) static char yram_buffer[0x800];
#define pram0_buffer ((void *)0xfe200000)
typedef struct
@ -87,12 +95,13 @@ region_t RAM_CU = { (void*)0x8c200000, 65536, 1, 0 };
region_t RAM_NC = { (void*)0xac200000, 2048, 16, 0 };
region_t ILRAM = { ilram_buffer, 2048, 64, 0 };
region_t XRAM = { xram_buffer, 2048, 64, DSPXRAM };
region_t YRAM = { yram_buffer, 2048, 64, DSPXRAM };
region_t PRAM0 = { pram0_buffer, 2048, 16, ONLY32BIT };
region_t const *REGIONS[] = {
&ROM_CF_MMU, &ROM_CU_MMU, &ROM_CF, &ROM_CU, &ROM_NC,
&RAM_CF_MMU, &RAM_CU_MMU, &RAM_CF, &RAM_CU, &RAM_NC,
&ILRAM, &XRAM, &PRAM0,
&ILRAM, &XRAM, &YRAM, &PRAM0,
};
char const *REGIONS_NAMES[] = {
"ROM (cached, MMU)", "ROM (cached linear, MMU)",
@ -101,7 +110,7 @@ char const *REGIONS_NAMES[] = {
"RAM (cached, MMU)", "RAM (cached linear, MMU)",
"RAM (cached, no MMU)", "RAM (cached linear, no MMU)",
"RAM (uncached, no MMU)",
"ILRAM", "XRAM", "PRAM0",
"ILRAM", "XRAM", "YRAM", "PRAM0",
};
#define REGIONS_COUNT ((int)(sizeof REGIONS / sizeof REGIONS[0]))
@ -112,7 +121,9 @@ char const *REGIONS_NAMES[] = {
typedef struct
{
int mem_read8, mem_read16, mem_read32;
int mem_read8_alt, mem_read16_alt, mem_read32_alt;
int mem_write8, mem_write16, mem_write32;
int mem_write8_alt, mem_write16_alt, mem_write32_alt;
int dma_memset;
union {
@ -147,82 +158,124 @@ static void benchmark(region_t const *region, info_t *info)
memset(info, 0xff, sizeof *info);
int f = region->flags;
int size = region->size;
void *p1 = region->pointer;
void *p2 = p1 + size / 2;
if(~f & ONLY32BIT)
/* Hack to switch page on XRAM/YRAM for reading and writing tests */
if(f & DSPXRAM)
p2 = (void *)((uint32_t)p1 ^ 0x00001000);
if(~f & ONLY32BIT) {
info->time.mem_read8 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read8(region->pointer, region->size);
mem_read8(p1, size);
});
if(~f & ONLY32BIT)
info->time.mem_read8_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read8_alt(p1, p2, size);
});
}
if(~f & ONLY32BIT) {
info->time.mem_read16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read16(region->pointer, region->size);
mem_read16(p1, size);
});
info->time.mem_read16_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read16_alt(p1, p2, size);
});
}
info->time.mem_read32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read32(region->pointer, region->size);
mem_read32(p1, size);
});
info->time.mem_read32_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_read32_alt(p1, p2, size);
});
if((~f & READONLY) && (~f & ONLY32BIT))
if((~f & READONLY) && (~f & ONLY32BIT)) {
info->time.mem_write8 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write8(region->pointer, region->size);
mem_write8(p1, size);
});
if((~f & READONLY) && (~f & ONLY32BIT))
info->time.mem_write8_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write8_alt(p1, p2, size);
});
}
if((~f & READONLY) && (~f & ONLY32BIT)) {
info->time.mem_write16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write16(region->pointer, region->size);
mem_write16(p1, size);
});
if(~f & READONLY)
info->time.mem_write16_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write16_alt(p1, p2, size);
});
}
if(~f & READONLY) {
info->time.mem_write32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write32(region->pointer, region->size);
mem_write32(p1, size);
});
info->time.mem_write32_alt = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_write32_alt(p1, p2, size);
});
}
if((~f & READONLY) && (~f & VIRTUAL))
info->time.dma_memset = prof_exec({
for(int i = 0; i < region->rounds; i++)
dma_memset(region->pointer, 0, region->size);
dma_memset(p1, 0, size);
});
if(f & DSPXRAM) {
info->time.mem_dspx_read16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dspx_read16(region->pointer, region->size);
mem_dspx_read16(p1, size);
});
info->time.mem_dspx_read32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dspx_read32(region->pointer, region->size);
mem_dspx_read32(p1, size);
});
info->time.mem_dspx_write16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dspx_write16(region->pointer, region->size);
mem_dspx_write16(p1, size);
});
info->time.mem_dspx_write32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dspx_write32(region->pointer, region->size);
mem_dspx_write32(p1, size);
});
}
if((~f & DSPXRAM) && (~f & ONLY32BIT))
info->time.mem_dsps_read16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dsps_read16(region->pointer, region->size);
mem_dsps_read16(p1, size);
});
if(~f & DSPXRAM)
info->time.mem_dsps_read32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dsps_read32(region->pointer, region->size);
mem_dsps_read32(p1, size);
});
if((~f & DSPXRAM) && (~f & ONLY32BIT) && (~f & READONLY))
info->time.mem_dsps_write16 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dsps_write16(region->pointer, region->size);
mem_dsps_write16(p1, size);
});
if((~f & DSPXRAM) && (~f & READONLY))
info->time.mem_dsps_write32 = prof_exec({
for(int i = 0; i < region->rounds; i++)
mem_dsps_write32(region->pointer, region->size);
mem_dsps_write32(p1, size);
});
if(~f & READONLY)
info->time.dma_memset = prof_exec({
for(int i = 0; i < region->rounds; i++)
dma_memset(p1, 0, size);
});
/* Cheeky method to read all ints in such a packed struct */
@ -243,35 +296,38 @@ static void benchmark(region_t const *region, info_t *info)
// Main interface
//---
void print_speed(int x, int y, int us, int kBps)
void print_speed(int x, int y, int unit, int us, int kBps)
{
if(us == -1 && kBps == -1) {
dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_MIDDLE, "-");
dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, "-");
return;
}
dprint_opt(x, y-2, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_BOTTOM,
"%d us", us);
char const *fmt;
if(kBps >= 100000) {
fmt = "%.1D M/s";
kBps /= 100;
if(unit == 1) {
dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, "%d us",us);
}
else {
fmt = "%.2D M/s";
kBps /= 10;
char const *fmt;
if(kBps >= 100000) {
fmt = "%.1D M/s";
kBps /= 100;
}
else {
fmt = "%.2D M/s";
kBps /= 10;
}
dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, fmt, kBps);
}
dprint_opt(x, y+1, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, fmt, kBps);
}
#define print_speed(x, y, FIELD) \
print_speed(x, y, info[selection].time.FIELD, info[selection].speed.FIELD)
#define print_speed(x, y, unit, FIELD) \
print_speed(x, y, unit, \
info[selection].time.FIELD, info[selection].speed.FIELD)
/* gintctl_perf_memory(): Memory primitives and reading/writing speed */
void gintctl_perf_memory(void)
{
// TODO: Also test copy speed
int key=0, selection=0;
int key=0, selection=0, unit=0;
info_t *info = malloc(REGIONS_COUNT * sizeof *info);
memset(info, 0xff, REGIONS_COUNT * sizeof *info);
@ -279,48 +335,57 @@ void gintctl_perf_memory(void)
while(key != KEY_EXIT) {
dclear(C_WHITE);
row_title("Memory read/write speed");
dprint_opt(DWIDTH/2, row_y(1), C_BLACK, C_NONE, DTEXT_CENTER,
DTEXT_TOP, "[%d/%d] %s", selection+1, REGIONS_COUNT,
REGIONS_NAMES[selection]);
/* row_print(2, 1, "%p (%d bytes, %d rounds)",
row_print(1, 1, "%s", REGIONS_NAMES[selection]);
dprint_opt(DWIDTH-40, row_y(1), C_BLACK, C_NONE, DTEXT_CENTER,
DTEXT_TOP, "%d/%d", selection+1, REGIONS_COUNT);
row_print(2, 1, "%p (%d bytes, %d rounds)",
REGIONS[selection]->pointer,
REGIONS[selection]->size,
REGIONS[selection]->rounds); */
row_print(2, 1, "%p (%d * %d = %d)", info, REGIONS_COUNT, sizeof *info,
REGIONS_COUNT * sizeof *info);
REGIONS[selection]->rounds);
dprint_opt(150, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
"8-bit");
"(8-bit)");
dprint_opt(240, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
"16-bit");
"(16-bit)");
dprint_opt(330, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
"32-bit");
"(32-bit)");
dprint(6, 74, C_BLACK, "CPU read:");
print_speed(150, 78, mem_read8);
print_speed(240, 78, mem_read16);
print_speed(330, 78, mem_read32);
dprint(6, 74, C_BLACK, "CPU read seq:");
print_speed(155, 74, unit, mem_read8);
print_speed(245, 74, unit, mem_read16);
print_speed(335, 74, unit, mem_read32);
dprint(6, 102, C_BLACK, "CPU write:");
print_speed(150, 106, mem_write8);
print_speed(240, 106, mem_write16);
print_speed(330, 106, mem_write32);
dprint(6, 88, C_BLACK, "CPU read alt:");
print_speed(155, 88, unit, mem_read8_alt);
print_speed(245, 88, unit, mem_read16_alt);
print_speed(335, 88, unit, mem_read32_alt);
dprint(6, 130, C_BLACK, "DSP read:");
print_speed(240, 134, mem_dsps_read16);
print_speed(330, 134, mem_dsps_read32);
dprint(6, 102, C_BLACK, "CPU write seq:");
print_speed(155, 102, unit, mem_write8);
print_speed(245, 102, unit, mem_write16);
print_speed(335, 102, unit, mem_write32);
dprint(6, 158, C_BLACK, "DSP write:");
print_speed(240, 162, mem_dsps_write16);
print_speed(330, 162, mem_dsps_write32);
dprint(6, 116, C_BLACK, "CPU write alt:");
print_speed(155, 116, unit, mem_write8_alt);
print_speed(245, 116, unit, mem_write16_alt);
print_speed(335, 116, unit, mem_write32_alt);
dprint(6, 130, C_BLACK, "DSP read seq:");
print_speed(245, 130, unit, mem_dsps_read16);
print_speed(335, 130, unit, mem_dsps_read32);
dprint(6, 158, C_BLACK, "DSP write seq:");
print_speed(245, 158, unit, mem_dsps_write16);
print_speed(335, 158, unit, mem_dsps_write32);
dprint(6, 186, C_BLACK, "dma_memset:");
print_speed(150, 190, dma_memset);
print_speed(155, 186, unit, dma_memset);
if(selection > 0)
dprint(10, row_y(1), C_BLACK, "<");
dprint(DWIDTH-72, row_y(1), C_BLACK, "<");
if(selection < REGIONS_COUNT - 1)
dprint(DWIDTH-15, row_y(1), C_BLACK, ">");
dprint(DWIDTH-12, row_y(1), C_BLACK, ">");
fkey_button(1, "UNIT");
fkey_button(6, "RUN ALL");
dupdate();
@ -329,6 +394,8 @@ void gintctl_perf_memory(void)
selection--;
if(key == KEY_RIGHT && selection < REGIONS_COUNT-1)
selection++;
if(key == KEY_F1)
unit = !unit;
if(key == KEY_F6) {
for(int i = 0; i < REGIONS_COUNT; i++)
benchmark(REGIONS[i], &info[i]);