diff --git a/CGDOOM-minisdk/CGDOOM/Makefile b/CGDOOM-minisdk/CGDOOM/Makefile index 8fe4866..a436705 100644 --- a/CGDOOM-minisdk/CGDOOM/Makefile +++ b/CGDOOM-minisdk/CGDOOM/Makefile @@ -6,7 +6,8 @@ RM=rm CFLAGS=-m4a-nofpu -mb -fgcse-sm -fgcse-las -fgcse-after-reload -Isrc -O3 -fmerge-all-constants -mhitachi -fuse-linker-plugin -Wall -Wextra -Wno-sign-compare -Wno-unused-but-set-variable -Wno-unused-but-set-parameter -I../../../../include -lgcc -L../../../../lib -I./ -I../../cgdoom -D_FXCG_MINICOMPAT LDFLAGS=$(CFLAGS) -nostartfiles -T../../../../toolchain/prizm.x -Wl,-static -lfxcg -lgcc -Wl,-Map=$(PROJ_NAME).map CSOURCES=$(wildcard ../../cgdoom/*.c) -OBJECTS=$(CSOURCES:.c=.o) +ASMSOURCES=$(wildcard ../../cgdoom/*.s) +OBJECTS=$(CSOURCES:.c=.o) $(ASMSOURCES:.s=.o) PROJ_NAME=CG_Doom BIN=$(PROJ_NAME).bin ELF=$(PROJ_NAME).elf diff --git a/CGDOOM-minisdk/CGDOOM/platform.h b/CGDOOM-minisdk/CGDOOM/platform.h index 9893e86..e7d0622 100644 --- a/CGDOOM-minisdk/CGDOOM/platform.h +++ b/CGDOOM-minisdk/CGDOOM/platform.h @@ -3,26 +3,46 @@ #define PLATFORM_H //--- -// WAD file access method (enable exactly one) +// WAD file access in Flash //--- +/* File access method (define exactly one) */ + /* Use BFile (100% accurate but slows down the game quite a bit because of reads happening all the time; mostly a good reference for testing) */ // #define CGDOOM_WAD_BFILE /* Search fragments in physical ROM when loading the game, and copy by hand from ROM to RAM during accesses (much faster) */ #define CGDOOM_WAD_MAPPING -/* Idem, but copy copy with DMA (even faster) */ -// #define CGDOOM_WAD_MAPPING_DMA /* Settings for file mappings: traverse the whole 32-MiB Flash */ -#define FLASH_START (0xA0000000) -#define FLASH_END (0xA2000000) -/* Storage unit is a cluster of 512 bytes */ +#define FLASH_START ((const void *)0xA0000000) +#define FLASH_END ((const void *)0xA2000000) +/* Where we expect the file system to start, approximately (this region is + searched first to hit sectors more quickly) 
*/ +#define FLASH_FS_HINT ((const void *)0xA0C00000) +/* Flash too, but cached; slower for sector searches but much faster for actual + data loads while in-game */ +#define FLASH_CACHED_START ((const void *)0x80000000) +#define FLASH_CACHED_END ((const void *)0x82000000) + +/* Storage unit is a sector of 512 bytes; Fugue tries to use clusters of 4 kiB + (8 sectors) but in exceptional circumstances cluster alignment can be lost + (such as when sectors are dead) */ #define FLASH_PAGE_SIZE 512 #define FLASH_PAGE_SIZE_LOG2 9 #define FLASH_PAGE_COUNT ((FLASH_END-FLASH_START) / FLASH_PAGE_SIZE) +/* Size of Bfile reads; performance is good when it's at least a cluster */ +#define FLASH_BFILE_UNIT 4096 + +/* Whether to index ROM sectors most likely to have data to use in sector + searches (comment out to disable) */ +#define FLASH_INDEX +/* Index contains every 4 kiB cluster from FLASH_FS_HINT to FLASH_END; fragments are + almost always 4 kiB-aligned, and only occasionally not */ +#define FLASH_INDEX_SIZE ((FLASH_END-FLASH_FS_HINT) / 4096) + //--- #include "keyboard.hpp" diff --git a/README b/README index c1e4f9f..11aaf0e 100644 --- a/README +++ b/README @@ -7,10 +7,12 @@ Credit goes to: * Lephenixnoir for the final fixes and fx-CG 50 version. TODO: +-> Fix level recap texture +-> Fix screen not cleared when changing resolution, add larger resolutions +-> Try and support more WADs -> Reenable LTO if possible -> Try and use more memory regions in z_zone.c -> Overclocking etc. --> Improve file mapping speed (DMA) CGDOOM used to be compiled with the mini-SDK. However, it's become quite difficult to get a copy of that.
Instead, this port is built with a slightly @@ -26,5 +28,6 @@ The differences are (I might push it later): * Linker script sets 500k of RAM instead of 64k * LTO disabled (hopefully it could be reenabled later) * Syscall memcpy() (apparently broken) replaced by fxlibc memcpy() +* fxlibc qsort() is added in libc/ [1] https://github.com/Jonimoose/libfxcg/ diff --git a/cgdoom/cgdoom-asm.s b/cgdoom/cgdoom-asm.s new file mode 100644 index 0000000..0156289 --- /dev/null +++ b/cgdoom/cgdoom-asm.s @@ -0,0 +1,105 @@ +.global _CGD_sector_memcmp +.align 4 + +# A pretty fast memcmp for 512-byte sectors, with equal(0)-different(1) output +# r4: 32-aligned pointer to sector in RAM (preferably 1-cycle operand bus RAM) +# r5: 32-aligned pointer to sector in ROM +# r6: 512 (ignored; for compatibility with memcmp prototype) +# +# There are two main ideas in this code: +# +# * Read with words, since such is the affinity of the ROM. (I don't know why.) +# I tested with longwords, the performance is much worse; bytes are somewhere +# in-between, which tormented me as I wondered why the most trivial memcmp() +# with poor assembler from libfxcg was faster than my hand-written function. +# +# * Weave iterations with smart register allocation to exploit superscalar +# parallelism. We read to r0/r1 while comparing r2/r3, then vice-versa. The +# two mov.w (LS) for one comparison execute in parallel with the cmp (EX) and +# bf (BR) of the previous comparison, so overall one comparison takes 2 +# cycles (plus any extra cycles in ROM reads if the cache isn't hit or +# doesn't respond immediately, and some loop overhead). +# +_CGD_sector_memcmp: + # For the first 32 bytes, compare as fast as possible to exit early + # when the sectors don't match (this saves a little bit). + mov #16, r7 +1: mov.w @r5+, r0 + mov.w @r4+, r1 + cmp/eq r0, r1 + bf .fail + dt r7 + bf 1b + + mov #30, r7 + +.line: + # There is a 2-cycle delay for the RAW dependency between each mov.w + # and the corresponding use.
Here the delay is honored so there are no + # cycles lost to RAW dependencies. + + mov.w @r5+, r0 + nop + + mov.w @r4+, r1 + nop + + mov.w @r5+, r2 + nop + + mov.w @r4+, r3 + cmp/eq r0, r1 + + mov.w @r5+, r0 + bf .fail + + mov.w @r4+, r1 + cmp/eq r2, r3 + + mov.w @r5+, r2 + bf .fail + + mov.w @r4+, r3 + cmp/eq r0, r1 + + mov.w @r5+, r0 + bf .fail + + mov.w @r4+, r1 + cmp/eq r2, r3 + + mov.w @r5+, r2 + bf .fail + + mov.w @r4+, r3 + cmp/eq r0, r1 + + mov.w @r5+, r0 + bf .fail + + mov.w @r4+, r1 + cmp/eq r2, r3 + + mov.w @r5+, r2 + bf .fail + + mov.w @r4+, r3 + cmp/eq r0, r1 + + # These two can run in parallel (BR/EX) + bf .fail + cmp/eq r2, r3 + + bf .fail + + dt r7 + bf .line + +.success: + rts + mov #0, r0 + +.fail: + # We don't specify an order + rts + mov #1, r0 diff --git a/cgdoom/cgdoom.c b/cgdoom/cgdoom.c index 06907df..3aebc90 100644 --- a/cgdoom/cgdoom.c +++ b/cgdoom/cgdoom.c @@ -263,23 +263,40 @@ int CGDstrnicmp (const char*s1,const char*s2,int iLen) // // The file is obviously fragmented and Yatis reverse-engineered Fugue enough // to determine that storage units are sectors of 512 bytes. While clusters of -// 4 kiB are used too, a file might not start on the first sector of a cluster, -// and some sectors might also be dysfunctional. +// 4 kiB are used in general, a file might not start on the first sector of a +// cluster, and some sectors might also simply be dead. +// +// Although all 65536 Flash sectors are searched when needed, several +// heuristics are used: +// * The region between FLASH_FS_HINT and FLASH_END is searched first, since +// this is roughly where the filesystem is located. +// * All sectors aligned on 4-kiB boundaries between FLASH_FS_HINT and +// FLASH_END (FLASH_INDEX_SIZE of them) are indexed by their first 4 bytes and +// binary searched for matches before anything else. // // See for Flash traversal parameters.
/////////////////////////////////////////////////////////////////////////////// -static uint16_t const *cgdoom_wad_path = u"\\\\fls0\\doom.wad"; - -#ifdef CGDOOM_WAD_BFILE -/* File descriptor to WAD file, used in Flash_ReadFile calls from w_wad.c. */ -static int cgdoom_wad_fd = -1; -#endif - //The whole sound doesn't fir onto the RAM. //Reading per partes is not possible as this is synchronnous player (there would be silences when reading). -//So I read each page (4KB)of the wav file and try to find it in the flash. -//Simply finding start of the file is not enough because of fragmentation. + +static uint16_t const *gWADpath = u"\\\\fls0\\doom.wad"; + +/* Fast memcmp() for 512-byte sectors. */ +int CGD_sector_memcmp(const void *fast_ram, const void *rom, size_t _512); + +/* Caching structure to read WAD files by larger chunks than Flash sectors. */ +typedef struct { + int fd; + int size, offset; + char *data; /* of size FLASH_BFILE_UNIT */ +} FileAccessCache; + +/* Index of most likely ROM sectors. */ +typedef struct { + const void *sector; + uint32_t start_bytes; +} SectorIndexInfo; //allocate 1024 items for max 1024 fragments of the file. // 640 KB should to be enough for everyone ;-) @@ -314,6 +331,9 @@ void I_Error (char *error, ...); #ifdef CGDOOM_WAD_BFILE +/* File descriptor to WAD file, used in Flash_ReadFile calls from w_wad.c. */ +static int gWADfd = -1; + int FindInFlash(void **buf, int size, int readpos) { return 0; @@ -321,102 +341,168 @@ int FindInFlash(void **buf, int size, int readpos) int Flash_ReadFile(void *buf, int size, int readpos) { - return Bfile_ReadFile_OS(cgdoom_wad_fd, buf, size, readpos); + return Bfile_ReadFile_OS(gWADfd, buf, size, readpos); } -#else /* CGDOOM_WAD_MAPPING, CGDOOM_WAD_MAPPING_DMA */ +#else /* CGDOOM_WAD_MAPPING */ static FileMapping *gpWADMap = 0; +static SectorIndexInfo *gIndex = NULL; -int CreateFileMapping(const unsigned short *pFileName,FileMapping *pMap) +/* Read next sector from file, while caching into a buffer. 
*/ +const void *ReadNextSector(FileAccessCache *fc, int *size) { - int iResult = 0; - char cBuffer[FLASH_PAGE_SIZE]; - int hFile = Bfile_OpenFile_OS(pFileName,0,0); - int iLength; - char *pFlashFS = (char *)FLASH_START; + if(fc->size == 0) + { + fc->size = Bfile_ReadFile_OS(fc->fd, fc->data, FLASH_BFILE_UNIT, -1); + fc->offset = 0; + } + if(fc->size <= 0) + { + *size = (fc->size < 0) ? -1 : 0; /* 0: nothing left to read; -1: the read reported an error */ + return NULL; + } + + *size = min(fc->size, FLASH_PAGE_SIZE); + fc->size -= *size; + const void *sector = fc->data + fc->offset; + fc->offset += *size; + return sector; +} + +/* qsort() comparator: orders index entries by start_bytes compared as unsigned values (a plain subtraction wraps once the gap exceeds 2^31 and is not a consistent ordering). */ +int IndexCompareSectors(const void *p1, const void *p2) +{ + const SectorIndexInfo *i1 = p1; + const SectorIndexInfo *i2 = p2; + return (i1->start_bytes > i2->start_bytes) - (i1->start_bytes < i2->start_bytes); +} + +/* Find all index entries whose start_bytes equal the first 4 bytes of buf; sets the half-open range [*lo_ptr, *hi_ptr), or -1/-1 when there is none. */ +void IndexSearchSector(SectorIndexInfo *index, const void *buf, int *lo_ptr, int *hi_ptr) +{ + uint32_t needle = *(const uint32_t *)buf; + *lo_ptr = *hi_ptr = -1; + + /* Find the first occurrence, set it in *lo_ptr */ + int lo=0, hi=FLASH_INDEX_SIZE; + + while(lo < hi) { + int m = (lo + hi) / 2; + uint32_t key = index[m].start_bytes; /* compare unsigned, same order as IndexCompareSectors */ + + if(key < needle) lo = m + 1; + else hi = m; + } + + if(lo >= FLASH_INDEX_SIZE || index[lo].start_bytes != needle) return; + *lo_ptr = hi = lo; + + /* Store one past the last occurrence in *hi_ptr */ + do hi++; + while(hi < FLASH_INDEX_SIZE && index[hi].start_bytes == needle); + + *hi_ptr = hi; +} + +static int index_hits = 0; +static int sector_searches = 0; + +/* Find a flash sector which contains the same data as buf.
*/ +int FindSectorInFlash(const void *buf, int size) +{ + typeof(&memcmp) memcmp_fun = &memcmp; + if(size == FLASH_PAGE_SIZE) memcmp_fun = &CGD_sector_memcmp; + + sector_searches++; + +#ifdef FLASH_INDEX + /* If an index has been built, search in it */ + int lo, hi; + IndexSearchSector(gIndex, buf, &lo, &hi); + for(int i = lo; i < hi; i++) { + if(!memcmp_fun(buf, gIndex[i].sector, size)) { + index_hits++; + return (gIndex[i].sector - FLASH_START) / FLASH_PAGE_SIZE; + } + } +#endif + + const void *sector = FLASH_FS_HINT; + do { + if(!memcmp_fun(buf, sector, size)) + return (sector - FLASH_START) / FLASH_PAGE_SIZE; + + sector += FLASH_PAGE_SIZE; + if(sector == FLASH_END) + sector = FLASH_START; + } + while(sector != FLASH_FS_HINT); + return -1; +} + +int CreateFileMapping(int fd, FileMapping *pMap) +{ + /* Cache accesses through a larger buffer */ + FileAccessCache fc = { + .data = (void *)0xe5007000, /* XRAM */ + .fd = fd + }; + int iLength = 0; pMap->miItemCount = 0; pMap->miTotalLength = 0; - iLength = Bfile_ReadFile_OS(hFile,cBuffer,FLASH_PAGE_SIZE,-1); + + const void *pFileData = ReadNextSector(&fc, &iLength); while(iLength > 0) { - //do not optimize (= do not move these 2 variables before loop)! - // fx-cg allocates pages for file in order so page from the end of the file - //can have lower index than page from the beginning - const char *pTgt = pFlashFS; - int iPageIndx = 0; + int iSectorID = FindSectorInFlash(pFileData, iLength); + if(iSectorID == -1) + return -2; // Page not found! - for(;iPageIndx < FLASH_PAGE_COUNT;iPageIndx++) - { - if(!memcmp(pTgt,cBuffer,iLength)) - { - break; - } - pTgt += FLASH_PAGE_SIZE; - } - if(iPageIndx == FLASH_PAGE_COUNT) - { - //page not found ! - iResult = -2; - goto lbExit; - } - pMap->miItemCount ++; + pMap->miItemCount++; if(pMap->miItemCount >= MAX_FRAGMENTS) - { - //file too fragmented ! 
- iResult = -3; - goto lbExit; - } - pMap->mTable[pMap->miItemCount-1].msOffset = (unsigned short)iPageIndx; + return -3; // File too fragmented! + + pMap->mTable[pMap->miItemCount-1].msOffset = iSectorID; pMap->mTable[pMap->miItemCount-1].msCount = 0; - //assume fragment has more pages + + /* Look for consecutive sectors in the same fragment */ + const void *pFragment = FLASH_START + (iSectorID * FLASH_PAGE_SIZE); for(;;) { pMap->mTable[pMap->miItemCount-1].msCount++; pMap->miTotalLength += iLength; - iPageIndx++; - pTgt += FLASH_PAGE_SIZE; + iSectorID++; + pFragment += FLASH_PAGE_SIZE; if(iLength < FLASH_PAGE_SIZE) { //this was the last page - iResult = pMap->miTotalLength; - goto lbExit; + return pMap->miTotalLength; } - iLength = Bfile_ReadFile_OS(hFile,cBuffer,FLASH_PAGE_SIZE,-1); + pFileData = ReadNextSector(&fc, &iLength); if(iLength <= 0) - { break; - } - if(memcmp(pTgt,cBuffer,iLength)) - { - break; - } + if((iLength == FLASH_PAGE_SIZE) + ? CGD_sector_memcmp(pFileData, pFragment, iLength) + : memcmp(pFileData, pFragment, iLength)) + break; } } if(iLength < 0) - { - iResult = -1; - } - else - { - if(pMap->miTotalLength >50000) - { - pMap->miTotalLength = 50000;//hack - } + return -1; - iResult = pMap->miTotalLength; + if(pMap->miTotalLength >50000) + { + pMap->miTotalLength = 50000;//hack } - -lbExit: - Bfile_CloseFile_OS(hFile); - return iResult; + return pMap->miTotalLength; } -int FindInFlash(void **buf, int size, int readpos) +int FindInFlash(const void **buf, int size, int readpos) { - int iPageReq = readpos >>FLASH_PAGE_SIZE_LOG2; + int iPageReq = readpos / FLASH_PAGE_SIZE; int iPageIndx = 0; int iCurrOffset = 0, iCurrLen; int iSubOffset; @@ -434,7 +520,7 @@ int FindInFlash(void **buf, int size, int readpos) break; } iPageReq -= gpWADMap->mTable[iPageIndx].msCount; - iCurrOffset += ((int)gpWADMap->mTable[iPageIndx].msCount) << FLASH_PAGE_SIZE_LOG2; + iCurrOffset += ((int)gpWADMap->mTable[iPageIndx].msCount) * FLASH_PAGE_SIZE; iPageIndx++; } iSubOffset 
= readpos - iCurrOffset; @@ -444,13 +530,13 @@ int FindInFlash(void **buf, int size, int readpos) { iCurrLen = size; } - *buf = ((char *)FLASH_START)+(gpWADMap->mTable[iPageIndx].msOffset << FLASH_PAGE_SIZE_LOG2)+iSubOffset; + *buf = FLASH_CACHED_START + (gpWADMap->mTable[iPageIndx].msOffset * FLASH_PAGE_SIZE) + iSubOffset; return iCurrLen; } int Flash_ReadFile(void *buf, int size, int readpos) { - void *pSrc; + const void *pSrc; int iRet = 0; while(size >0) { @@ -479,7 +565,7 @@ void abort(void){ } /////////////////////////////////////////////////////////////////////////////////////////////////// int main(void){ - InitFlashSimu(cgdoom_wad_path); //load wad file to flash simulation on simulator, do nothing on real HW + InitFlashSimu(gWADpath); //load wad file to flash simulation on simulator, do nothing on real HW #ifdef CG_EMULATOR SaveVRAMBuffer = aSaveVRAMBuffer; SystemStack = aSystemStack; @@ -495,12 +581,29 @@ int main(void){ /* Setup access to WAD file */ #ifdef CGDOOM_WAD_BFILE - cgdoom_wad_fd = Bfile_OpenFile_OS(cgdoom_wad_path, 0, 0); + gWADfd = Bfile_OpenFile_OS(gWADpath, 0, 0); #else int time_start = RTC_GetTicks(); + + #ifdef FLASH_INDEX + /* Index most likely flash sectors into a sorted array, so that sectors + can be hit quickly. The index contains every sector on a 4-kiB + boundary (where fragments are most likely to start) between + FLASH_FS_HINT and FLASH_END. 
*/ + gIndex = (void *)SystemStack; + for(int i = 0; i < FLASH_INDEX_SIZE; i++) { + SectorIndexInfo *info = &gIndex[i]; + info->sector = FLASH_FS_HINT + (i * 4096); + info->start_bytes = *(const uint32_t *)info->sector; + } + qsort(gIndex, FLASH_INDEX_SIZE, sizeof *gIndex, IndexCompareSectors); + #endif + gpWADMap = (FileMapping *)(SaveVRAMBuffer + 2*65536); ASSERT(2*65536 + sizeof(FileMapping) < SAVE_VRAM_SIZE); - int size = CreateFileMapping(cgdoom_wad_path,gpWADMap); + int fd = Bfile_OpenFile_OS(gWADpath,0,0); + int size = CreateFileMapping(fd, gpWADMap); + Bfile_CloseFile_OS(fd); int time_end = RTC_GetTicks(); if(size == -1) { @@ -515,17 +618,19 @@ int main(void){ I_Error ("File too fragmented"); return 1; } - else if(size < 0) { - I_ErrorI ("CreateFileMapping", size, 0, 0, 0); - return 1; - } else { int key; char line[22]; int time_ms = (time_end - time_start) * 8; - CGDAppendNum0_999("mmap (ms): ", time_ms, 1, line); Bdisp_AllClr_VRAM(); locate_OS(1, 1); + CGDAppendNum0_999("mmap (ms): ", time_ms, 1, line); + PrintLine(line, 21); + locate_OS(1, 2); + CGDAppendNum0_999("Searches: ", sector_searches, 1, line); + PrintLine(line, 21); + locate_OS(1, 3); + CGDAppendNum0_999("Index hits: ", index_hits, 1, line); PrintLine(line, 21); Bdisp_PutDisp_DD(); GetKey(&key); diff --git a/cgdoom/os.h b/cgdoom/os.h index 002944e..e9b1a4f 100644 --- a/cgdoom/os.h +++ b/cgdoom/os.h @@ -43,9 +43,17 @@ void I_ErrorI(const char *str, int i1, int i2, int i3, int i4); #define sprintf 212 //return ptr to flash -int FindInFlash(void **buf, int size, int readpos); +int FindInFlash(const void **buf, int size, int readpos); //direct read from flash int Flash_ReadFile(void *buf, int size, int readpos); //CGD: bypass for direct pointers to flash -#define PTR_TO_FLASH(x) (((int)x < FLASH_END) && ((int)x >= FLASH_START)) +#define PTR_TO_FLASH(x) ( \ + ((x) >= FLASH_START && (x) < FLASH_END) || \ + ((x) >= FLASH_CACHED_START && (x) < FLASH_CACHED_END)) + +#define min(x,y) ({ \ + __auto_type 
__x = (x); \ + __auto_type __y = (y); \ + __x < __y ? __x : __y; \ +})