perf/memory: benchmark alternating two-address CPU accesses, add a YRAM region

memory.S: adds mem_{read,write}{8,16,32}_alt. Each one accesses the two fixed
addresses passed in r4 and r5 in turn, without incrementing; the iteration
count is derived from r6 (shifted right by 1, 2 and 3 bits for the 8-, 16- and
32-bit variants). PRELUDE now loads its 4095 constant with mov.l from a .long
literal instead of mov.w from a .word.

memory.c: declares the new primitives, enables yram_buffer and a YRAM region,
times the *_alt variants in benchmark(), and adds an F1 "UNIT" key that
switches the display between "%d us" and "M/s".

Review adjustments made to this patch:
  1. benchmark(): removed a second dma_memset measurement that had been
     appended after the DSP tests. It overwrote info->time.dma_memset from the
     existing measurement and ran without that measurement's VIRTUAL check.
  2. _mem_write8_alt: removed a stray "mov #0, r0". PRELUDE already clears r0
     and the 16-bit and 32-bit variants have no such instruction.
  3. Hunk headers were recomputed for both removals. The index lines still
     carry the blob ids of the unadjusted patch.

diff --git a/src/perf/memory.S b/src/perf/memory.S
index 8e240f5..e953074 100644
--- a/src/perf/memory.S
+++ b/src/perf/memory.S
@@ -1,7 +1,7 @@
 /* Useful macros to get a repeating DSP loop for instances with more than 4096
    iterations. Uses r1 and r2, labels 3: and 4:. */
 #define PRELUDE(ITERATIONS) \
-	mov.w .w4095, r1; \
+	mov.l .l4095, r1; \
 	mov #0, r0; \
 3:	ldrs 1f; \
 	cmp/hi r1, ITERATIONS; \
@@ -70,6 +70,68 @@ _mem_write32:
 1: 2:	mov.l r0, @-r4
 	EPILOGUE(r5)
 
+/* Alternate reads from CPU */
+
+.global _mem_read8_alt
+.global _mem_read16_alt
+.global _mem_read32_alt
+
+_mem_read8_alt:
+	shlr r6
+	PRELUDE(r6)
+1:	mov.b @r4, r0
+2:	mov.b @r5, r0
+	EPILOGUE(r6)
+
+_mem_read16_alt:
+	shlr2 r6
+	PRELUDE(r6)
+1:	mov.w @r4, r0
+2:	mov.w @r5, r0
+	EPILOGUE(r6)
+
+_mem_read32_alt:
+	shlr r6
+	shlr2 r6
+	PRELUDE(r6)
+1:	mov.l @r4, r0
+2:	mov.l @r5, r0
+	EPILOGUE(r6)
+
+/* Alternate writes from CPU */
+
+.global _mem_write8_alt
+.global _mem_write16_alt
+.global _mem_write32_alt
+
+_mem_write8_alt:
+	mov.b @r4, r0
+	mov.b @r5, r3
+	shlr r6
+	PRELUDE(r6)
+1:	mov.b r0, @r4
+2:	mov.b r3, @r5
+	EPILOGUE(r6)
+
+_mem_write16_alt:
+	mov.w @r4, r0
+	mov.w @r5, r3
+	shlr2 r6
+	PRELUDE(r6)
+1:	mov.w r0, @r4
+2:	mov.w r3, @r5
+	EPILOGUE(r6)
+
+_mem_write32_alt:
+	mov.l @r4, r0
+	mov.l @r5, r3
+	shlr r6
+	shlr2 r6
+	PRELUDE(r6)
+1:	mov.l r0, @r4
+2:	mov.l r3, @r5
+	EPILOGUE(r6)
+
 /* Memory reads and writes from DSP XRAM */
 
 .global _mem_dspx_read16
@@ -140,4 +202,6 @@ _mem_dsps_write32:
 1: 2:	movs.l x0, @r4+
 	EPILOGUE(r5)
 
-.w4095:	.word 4095
+.align 4
+.l4095:
+	.long 4095
diff --git a/src/perf/memory.c b/src/perf/memory.c
index 84a2010..2a006ed 100644
--- a/src/perf/memory.c
+++ b/src/perf/memory.c
@@ -22,10 +22,18 @@
 extern void mem_read8 (void *mem, int size);
 extern void mem_read16 (void *mem, int size);
 extern void mem_read32 (void *mem, int size);
-/* Right asm writes of different sizes. SPU2 memory only supports 32-bit */
+/* Tight asm writes of different sizes. SPU2 memory only supports 32-bit */
 extern void mem_write8 (void *mem, int size);
 extern void mem_write16 (void *mem, int size);
 extern void mem_write32 (void *mem, int size);
+/* Tight asm reads of 2 addresses; size is the total volume; no increment */
+extern void mem_read8_alt (void *mem1, void *mem2, int size);
+extern void mem_read16_alt (void *mem1, void *mem2, int size);
+extern void mem_read32_alt (void *mem1, void *mem2, int size);
+/* Tight asm writes of 2 addresses; size is the total volume; no increment */
+extern void mem_write8_alt (void *mem1, void *mem2, int size);
+extern void mem_write16_alt (void *mem1, void *mem2, int size);
+extern void mem_write32_alt (void *mem1, void *mem2, int size);
 /* Same using the DSP's XRAM addressing instructions (movx) */
 extern void mem_dspx_read16 (void *mem, int size);
 extern void mem_dspx_read32 (void *mem, int size);
@@ -60,7 +68,7 @@ extern void *dma_memcpy (void *dst, void const *src, size_t size);
 
 GILRAM GALIGNED(32) static char ilram_buffer[0x800];
 GXRAM GALIGNED(32) static char xram_buffer[0x800];
-// GYRAM GALIGNED(32) static char yram_buffer[0x800];
+GYRAM GALIGNED(32) static char yram_buffer[0x800];
 #define pram0_buffer ((void *)0xfe200000)
 
 typedef struct
@@ -87,12 +95,13 @@ region_t RAM_CU = { (void*)0x8c200000, 65536, 1, 0 };
 region_t RAM_NC = { (void*)0xac200000, 2048, 16, 0 };
 region_t ILRAM = { ilram_buffer, 2048, 64, 0 };
 region_t XRAM = { xram_buffer, 2048, 64, DSPXRAM };
+region_t YRAM = { yram_buffer, 2048, 64, DSPXRAM };
 region_t PRAM0 = { pram0_buffer, 2048, 16, ONLY32BIT };
 
 region_t const *REGIONS[] = {
 	&ROM_CF_MMU, &ROM_CU_MMU, &ROM_CF, &ROM_CU, &ROM_NC,
 	&RAM_CF_MMU, &RAM_CU_MMU, &RAM_CF, &RAM_CU, &RAM_NC,
-	&ILRAM, &XRAM, &PRAM0,
+	&ILRAM, &XRAM, &YRAM, &PRAM0,
 };
 char const *REGIONS_NAMES[] = {
 	"ROM (cached, MMU)", "ROM (cached linear, MMU)",
@@ -101,7 +110,7 @@ char const *REGIONS_NAMES[] = {
 	"RAM (cached, MMU)", "RAM (cached linear, MMU)",
 	"RAM (cached, no MMU)", "RAM (cached linear, no MMU)",
 	"RAM (uncached, no MMU)",
-	"ILRAM", "XRAM", "PRAM0",
+	"ILRAM", "XRAM", "YRAM", "PRAM0",
 };
 
 #define REGIONS_COUNT ((int)(sizeof REGIONS / sizeof REGIONS[0]))
@@ -112,7 +121,9 @@ char const *REGIONS_NAMES[] = {
 typedef struct
 {
 	int mem_read8, mem_read16, mem_read32;
+	int mem_read8_alt, mem_read16_alt, mem_read32_alt;
 	int mem_write8, mem_write16, mem_write32;
+	int mem_write8_alt, mem_write16_alt, mem_write32_alt;
 	int dma_memset;
 
 	union {
@@ -147,82 +158,118 @@ static void benchmark(region_t const *region, info_t *info)
 	memset(info, 0xff, sizeof *info);
 
 	int f = region->flags;
+	int size = region->size;
+	void *p1 = region->pointer;
+	void *p2 = p1 + size / 2;
 
-	if(~f & ONLY32BIT)
+	/* Hack to switch page on XRAM/YRAM for reading and writing tests */
+	if(f & DSPXRAM)
+		p2 = (void *)((uint32_t)p1 ^ 0x00001000);
+
+	if(~f & ONLY32BIT) {
 	info->time.mem_read8 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_read8(region->pointer, region->size);
+			mem_read8(p1, size);
 	});
-	if(~f & ONLY32BIT)
+	info->time.mem_read8_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_read8_alt(p1, p2, size);
+	});
+	}
+	if(~f & ONLY32BIT) {
 	info->time.mem_read16 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_read16(region->pointer, region->size);
+			mem_read16(p1, size);
 	});
+	info->time.mem_read16_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_read16_alt(p1, p2, size);
+	});
+	}
 	info->time.mem_read32 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_read32(region->pointer, region->size);
+			mem_read32(p1, size);
+	});
+	info->time.mem_read32_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_read32_alt(p1, p2, size);
 	});
 
-	if((~f & READONLY) && (~f & ONLY32BIT))
+	if((~f & READONLY) && (~f & ONLY32BIT)) {
 	info->time.mem_write8 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_write8(region->pointer, region->size);
+			mem_write8(p1, size);
 	});
-	if((~f & READONLY) && (~f & ONLY32BIT))
+	info->time.mem_write8_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_write8_alt(p1, p2, size);
+	});
+	}
+	if((~f & READONLY) && (~f & ONLY32BIT)) {
 	info->time.mem_write16 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_write16(region->pointer, region->size);
+			mem_write16(p1, size);
 	});
-	if(~f & READONLY)
+	info->time.mem_write16_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_write16_alt(p1, p2, size);
+	});
+	}
+	if(~f & READONLY) {
 	info->time.mem_write32 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_write32(region->pointer, region->size);
+			mem_write32(p1, size);
 	});
+	info->time.mem_write32_alt = prof_exec({
+		for(int i = 0; i < region->rounds; i++)
+			mem_write32_alt(p1, p2, size);
+	});
+	}
 
 	if((~f & READONLY) && (~f & VIRTUAL))
 	info->time.dma_memset = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			dma_memset(region->pointer, 0, region->size);
+			dma_memset(p1, 0, size);
 	});
 
 	if(f & DSPXRAM) {
 		info->time.mem_dspx_read16 = prof_exec({
 			for(int i = 0; i < region->rounds; i++)
-				mem_dspx_read16(region->pointer, region->size);
+				mem_dspx_read16(p1, size);
 		});
 		info->time.mem_dspx_read32 = prof_exec({
 			for(int i = 0; i < region->rounds; i++)
-				mem_dspx_read32(region->pointer, region->size);
+				mem_dspx_read32(p1, size);
 		});
 		info->time.mem_dspx_write16 = prof_exec({
 			for(int i = 0; i < region->rounds; i++)
-				mem_dspx_write16(region->pointer, region->size);
+				mem_dspx_write16(p1, size);
 		});
 		info->time.mem_dspx_write32 = prof_exec({
 			for(int i = 0; i < region->rounds; i++)
-				mem_dspx_write32(region->pointer, region->size);
+				mem_dspx_write32(p1, size);
 		});
 	}
 
 	if((~f & DSPXRAM) && (~f & ONLY32BIT))
 	info->time.mem_dsps_read16 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_dsps_read16(region->pointer, region->size);
+			mem_dsps_read16(p1, size);
 	});
 	if(~f & DSPXRAM)
 	info->time.mem_dsps_read32 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_dsps_read32(region->pointer, region->size);
+			mem_dsps_read32(p1, size);
 	});
 	if((~f & DSPXRAM) && (~f & ONLY32BIT) && (~f & READONLY))
 	info->time.mem_dsps_write16 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_dsps_write16(region->pointer, region->size);
+			mem_dsps_write16(p1, size);
 	});
 	if((~f & DSPXRAM) && (~f & READONLY))
 	info->time.mem_dsps_write32 = prof_exec({
 		for(int i = 0; i < region->rounds; i++)
-			mem_dsps_write32(region->pointer, region->size);
+			mem_dsps_write32(p1, size);
 	});
 
 	/* Cheeky method to read all ints in such a packed struct */
@@ -243,35 +290,38 @@ static void benchmark(region_t const *region, info_t *info)
 // Main interface
 //---
 
-void print_speed(int x, int y, int us, int kBps)
+void print_speed(int x, int y, int unit, int us, int kBps)
 {
 	if(us == -1 && kBps == -1) {
-		dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_MIDDLE, "-");
+		dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, "-");
 		return;
 	}
-	dprint_opt(x, y-2, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_BOTTOM,
-		"%d us", us);
-
-	char const *fmt;
-	if(kBps >= 100000) {
-		fmt = "%.1D M/s";
-		kBps /= 100;
+	if(unit == 1) {
+		dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, "%d us",us);
 	}
 	else {
-		fmt = "%.2D M/s";
-		kBps /= 10;
+		char const *fmt;
+		if(kBps >= 100000) {
+			fmt = "%.1D M/s";
+			kBps /= 100;
+		}
+		else {
+			fmt = "%.2D M/s";
+			kBps /= 10;
+		}
+		dprint_opt(x, y, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, fmt, kBps);
 	}
-	dprint_opt(x, y+1, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP, fmt, kBps);
 }
 
-#define print_speed(x, y, FIELD) \
-	print_speed(x, y, info[selection].time.FIELD, info[selection].speed.FIELD)
+#define print_speed(x, y, unit, FIELD) \
+	print_speed(x, y, unit, \
+	info[selection].time.FIELD, info[selection].speed.FIELD)
 
 /* gintctl_perf_memory(): Memory primitives and reading/writing speed */
 void gintctl_perf_memory(void)
 {
 	// TODO: Also test copy speed
-	int key=0, selection=0;
+	int key=0, selection=0, unit=0;
 	info_t *info = malloc(REGIONS_COUNT * sizeof *info);
 	memset(info, 0xff, REGIONS_COUNT * sizeof *info);
 
@@ -279,48 +329,57 @@ void gintctl_perf_memory(void)
 	while(key != KEY_EXIT) {
 		dclear(C_WHITE);
 		row_title("Memory read/write speed");
-		dprint_opt(DWIDTH/2, row_y(1), C_BLACK, C_NONE, DTEXT_CENTER,
-			DTEXT_TOP, "[%d/%d] %s", selection+1, REGIONS_COUNT,
-			REGIONS_NAMES[selection]);
-/*		row_print(2, 1, "%p (%d bytes, %d rounds)",
+		row_print(1, 1, "%s", REGIONS_NAMES[selection]);
+		dprint_opt(DWIDTH-40, row_y(1), C_BLACK, C_NONE, DTEXT_CENTER,
+			DTEXT_TOP, "%d/%d", selection+1, REGIONS_COUNT);
+		row_print(2, 1, "%p (%d bytes, %d rounds)",
 			REGIONS[selection]->pointer, REGIONS[selection]->size,
-			REGIONS[selection]->rounds); */
-		row_print(2, 1, "%p (%d * %d = %d)", info, REGIONS_COUNT, sizeof *info,
-			REGIONS_COUNT * sizeof *info);
+			REGIONS[selection]->rounds);
 
 		dprint_opt(150, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
-			"8-bit");
+			"(8-bit)");
 		dprint_opt(240, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
-			"16-bit");
+			"(16-bit)");
 		dprint_opt(330, 53, C_BLACK, C_NONE, DTEXT_CENTER, DTEXT_TOP,
-			"32-bit");
+			"(32-bit)");
 
-		dprint(6, 74, C_BLACK, "CPU read:");
-		print_speed(150, 78, mem_read8);
-		print_speed(240, 78, mem_read16);
-		print_speed(330, 78, mem_read32);
+		dprint(6, 74, C_BLACK, "CPU read seq:");
+		print_speed(155, 74, unit, mem_read8);
+		print_speed(245, 74, unit, mem_read16);
+		print_speed(335, 74, unit, mem_read32);
 
-		dprint(6, 102, C_BLACK, "CPU write:");
-		print_speed(150, 106, mem_write8);
-		print_speed(240, 106, mem_write16);
-		print_speed(330, 106, mem_write32);
+		dprint(6, 88, C_BLACK, "CPU read alt:");
+		print_speed(155, 88, unit, mem_read8_alt);
+		print_speed(245, 88, unit, mem_read16_alt);
+		print_speed(335, 88, unit, mem_read32_alt);
 
-		dprint(6, 130, C_BLACK, "DSP read:");
-		print_speed(240, 134, mem_dsps_read16);
-		print_speed(330, 134, mem_dsps_read32);
+		dprint(6, 102, C_BLACK, "CPU write seq:");
+		print_speed(155, 102, unit, mem_write8);
+		print_speed(245, 102, unit, mem_write16);
+		print_speed(335, 102, unit, mem_write32);
 
-		dprint(6, 158, C_BLACK, "DSP write:");
-		print_speed(240, 162, mem_dsps_write16);
-		print_speed(330, 162, mem_dsps_write32);
+		dprint(6, 116, C_BLACK, "CPU write alt:");
+		print_speed(155, 116, unit, mem_write8_alt);
+		print_speed(245, 116, unit, mem_write16_alt);
+		print_speed(335, 116, unit, mem_write32_alt);
+
+		dprint(6, 130, C_BLACK, "DSP read seq:");
+		print_speed(245, 130, unit, mem_dsps_read16);
+		print_speed(335, 130, unit, mem_dsps_read32);
+
+		dprint(6, 158, C_BLACK, "DSP write seq:");
+		print_speed(245, 158, unit, mem_dsps_write16);
+		print_speed(335, 158, unit, mem_dsps_write32);
 
 		dprint(6, 186, C_BLACK, "dma_memset:");
-		print_speed(150, 190, dma_memset);
+		print_speed(155, 186, unit, dma_memset);
 
 		if(selection > 0)
-			dprint(10, row_y(1), C_BLACK, "<");
+			dprint(DWIDTH-72, row_y(1), C_BLACK, "<");
 		if(selection < REGIONS_COUNT - 1)
-			dprint(DWIDTH-15, row_y(1), C_BLACK, ">");
+			dprint(DWIDTH-12, row_y(1), C_BLACK, ">");
 
+		fkey_button(1, "UNIT");
 		fkey_button(6, "RUN ALL");
 		dupdate();
 
@@ -329,6 +388,8 @@ void gintctl_perf_memory(void)
 			selection--;
 		if(key == KEY_RIGHT && selection < REGIONS_COUNT-1)
 			selection++;
+		if(key == KEY_F1)
+			unit = !unit;
 		if(key == KEY_F6) {
 			for(int i = 0; i < REGIONS_COUNT; i++)
 				benchmark(REGIONS[i], &info[i]);