py/gc: Speed up incremental GC cycles by tracking the last used block.

In applications that use little memory and run GC regularly, the cost of
the sweep phase quickly becomes prohibitive as the amount of RAM
increases.

On an ESP32-S3 with 2 MB of external SPIRAM, for example, a trivial GC
cycle takes a minimum of 40 ms, virtually all of it in the sweep phase.

Similarly, on the UNIX port with 1 GB of heap, a trivial GC takes 47 ms,
again virtually all of it in the sweep phase.

This commit speeds up the sweep phase when most of the heap is empty by
keeping track of the ID of the highest block allocated in an area since the
last GC.

The performance benchmark run on PYBV10 shows between +0 and +2%
improvement across the existing performance tests.  These tests don't
really stress the GC, so they were also run with gc.threshold(30000) and
gc.threshold(10000).  For the 30000 case, performance improved by up to
+10% with this commit.  For the 10000 case, performance improved by at
least +10% on 6 tests, and up to +25%.

Signed-off-by: Damien George <damien@micropython.org>
This commit is contained in:
Damien Tournoud 2022-12-15 14:09:19 -08:00 committed by Damien George
parent 70c564324c
commit 2dcd745434
2 changed files with 21 additions and 2 deletions

22
py/gc.c
View File

@ -158,6 +158,7 @@ STATIC void gc_setup_area(mp_state_mem_area_t *area, void *start, void *end) {
#endif
area->gc_last_free_atb_index = 0;
area->gc_last_used_block = 0;
#if MICROPY_GC_SPLIT_HEAP
area->next = NULL;
@ -378,7 +379,14 @@ STATIC void gc_sweep(void) {
// free unmarked heads and their tails
int free_tail = 0;
for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) {
for (size_t block = 0; block < area->gc_alloc_table_byte_len * BLOCKS_PER_ATB; block++) {
size_t end_block = area->gc_alloc_table_byte_len * BLOCKS_PER_ATB;
if (area->gc_last_used_block < end_block) {
end_block = area->gc_last_used_block + 1;
}
size_t last_used_block = 0;
for (size_t block = 0; block < end_block; block++) {
MICROPY_GC_HOOK_LOOP(block);
switch (ATB_GET_KIND(area, block)) {
case AT_HEAD:
@ -418,15 +426,20 @@ STATIC void gc_sweep(void) {
#if CLEAR_ON_SWEEP
memset((void *)PTR_FROM_BLOCK(area, block), 0, BYTES_PER_BLOCK);
#endif
} else {
last_used_block = block;
}
break;
case AT_MARK:
ATB_MARK_TO_HEAD(area, block);
free_tail = 0;
last_used_block = block;
break;
}
}
area->gc_last_used_block = last_used_block;
}
}
@ -680,6 +693,8 @@ found:
area->gc_last_free_atb_index = (i + 1) / BLOCKS_PER_ATB;
}
area->gc_last_used_block = MAX(area->gc_last_used_block, end_block);
// mark first block as used head
ATB_FREE_TO_HEAD(area, start_block);
@ -971,11 +986,14 @@ void *gc_realloc(void *ptr_in, size_t n_bytes, bool allow_move) {
// check if we can expand in place
if (new_blocks <= n_blocks + n_free) {
// mark few more blocks as used tail
for (size_t bl = block + n_blocks; bl < block + new_blocks; bl++) {
size_t end_block = block + new_blocks;
for (size_t bl = block + n_blocks; bl < end_block; bl++) {
assert(ATB_GET_KIND(area, bl) == AT_FREE);
ATB_FREE_TO_TAIL(area, bl);
}
area->gc_last_used_block = MAX(area->gc_last_used_block, end_block);
GC_EXIT();
#if MICROPY_GC_CONSERVATIVE_CLEAR

View File

@ -93,6 +93,7 @@ typedef struct _mp_state_mem_area_t {
byte *gc_pool_end;
size_t gc_last_free_atb_index;
size_t gc_last_used_block; // The block ID of the highest block allocated in the area
} mp_state_mem_area_t;
// This structure holds information about the memory allocation system.