gint/src/render-fx/bopti.c

297 lines
7.8 KiB
C

#include <gint/defs/types.h>
#include <gint/display.h>
#include "render-fx.h"
#include "bopti-asm.h"
#pragma GCC optimize("O3")
/* struct command: A rendering command
Includes many computed parameters and handy information. Read-only. */
struct command
{
/* x-coordinate of rendering box & 31, used for shifts */
int x;
/* VRAM pointers */
uint32_t *v1;
uint32_t *v2;
/* Initial offset into VRAM */
int offset;
/* Number of VRAM columns affected by the bounding box; this is the
same as the number of rendered image columns if x=0, and this number
plus 1 otherwise. */
int columns;
/* A certain set of rendering masks (see bopti_render()) */
uint32_t *masks;
/* Whether the first column is real (ie. x>=0) or not */
int real_start;
/* Whether the last column is written to VRAM */
int real_end;
/* Ignored elements between two rendered grid rows */
int vram_stride;
/* Ignored elements between two rendered grid columns */
int data_stride;
/* Whether the image should be drawn on gray mode (this may be 1 even
for images of the mono and mono_alpha profiles) */
int gray;
/* Assembly function, prototype depends on image type */
bopti_asm_t f;
};
/* List of rendering functions */
static asm_mono_t *asm_mono[] = {
bopti_asm_mono,
bopti_asm_mono_alpha,
};
static asm_gray_t *asm_gray[] = {
bopti_gasm_mono,
bopti_gasm_mono_alpha,
bopti_gasm_gray,
bopti_gasm_gray_alpha,
};
static asm_mono_scsp_t *asm_mono_scsp[] = {
bopti_asm_mono_scsp,
bopti_asm_mono_alpha_scsp,
};
static asm_gray_scsp_t *asm_gray_scsp[] = {
bopti_gasm_mono_scsp,
bopti_gasm_mono_alpha_scsp,
bopti_gasm_gray_scsp,
bopti_gasm_gray_alpha_scsp,
};
void bopti_grid(void **layer, int rows, struct command *c)
{
/* Pointers to vram data */
uint32_t *v1 = c->v1, *v2 = c->v2;
/* Current offset into video RAM */
uint offset = c->offset;
/* Pairs of VRAM operands. A function that returns such a pair will be
optimized by GCC into a function returning into r0,r1 which will
avoid some memory accesses. */
pair_t p, pret = { 0 };
/* Same with two pairs for the gray version (no optimization here) */
quadr_t q, qret = { 0 };
/* Monochrome version */
if(!c->gray) while(rows--)
{
p.r = pret.r = v1[offset & 0xff];
for(int col = 0; col < c->columns; col++)
{
/* Shift the pair to the left. When x=0, we should have
pret.r = p.r but due to some intentional UB with
32-bit shifts, pret.r != p.r so we reload p.r. */
p.l = (c->x) ? pret.r : p.r;
/* Load new second element, if offset+1 overflows from
the VRAM we load from offset 0. It doesn't matter
because the result will not be written back, I just
want to avoid reading from outside the VRAM. */
p.r = v1[(offset + 1) & 0xff];
/* The assembly routine blends a longword of data onto
the pair and returns the resulting pair. */
pret = c->f.asm_mono(p, layer, c->masks+col+col,-c->x);
/* Write back the result into VRAM, except for column
-1 (occurs once every row, iff visual_x < 0) */
if(c->real_start + col) v1[offset] = pret.l;
offset++;
}
if(c->real_end) v1[offset] = pret.r;
*layer += c->data_stride;
offset += c->vram_stride;
}
/* Gray version */
else while(rows--)
{
if(c->real_start)
{
q.r1 = qret.r1 = v1[offset & 0xff];
q.r2 = qret.r2 = v2[offset & 0xff];
}
/* Same as before, but 2 buffers at the same time */
for(int col = 0; col < c->columns; col++)
{
q.l1 = (c->x) ? qret.r1 : q.r1;
q.r1 = v1[(offset + 1) & 0xff];
q.l2 = (c->x) ? qret.r2 : q.r2;
q.r2 = v2[(offset + 1) & 0xff];
c->f.asm_gray(q, layer, c->masks+col+col, -c->x,&qret);
if(c->real_start + col)
{
v1[offset] = qret.l1;
v2[offset] = qret.l2;
}
offset++;
}
if(c->real_end)
{
v1[offset] = qret.r1;
v2[offset] = qret.r2;
}
*layer += c->data_stride;
offset += c->vram_stride;
}
}
void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
uint32_t *v2)
{
/* Rendering function */
bopti_asm_t f;
if(v2) f.asm_gray = asm_gray[img->profile];
else f.asm_mono = asm_mono[img->profile];
/* Compute rendering masks */
uint32_t vm[4];
masks(rbox->visual_x, rbox->visual_x + rbox->width - 1, vm);
/* Number of layers per profile */
static const int layer_count[] = { 1, 2, 2, 3 };
int layers = layer_count[img->profile];
/* For each pair of consecutive VRAM elements involved, create a mask
from the intersection of the standard vram mask with the shift-mask
related to x not being a multiple of 32 */
uint32_t masks[10] = {
0, vm[0],
vm[0], vm[1],
vm[1], vm[2],
vm[2], vm[3],
vm[3], 0,
};
uint32_t mx = 0xffffffff >> (rbox->x & 31);
for(int i = 0; i < 5; i++)
{
masks[2*i] &= mx;
masks[2*i+1] &= ~mx;
}
/* Position, in masks[], of the first column being rendered */
int left_origin = (rbox->x >> 5) + 1;
/* Number of columns in [img] */
int img_columns = (img->width + 31) >> 5;
/* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers;
/* Compute and execute the command for this parameters */
struct command c = {
.x = rbox->x & 31,
.v1 = v1,
.v2 = v2 ? v2 : v1,
.offset = (rbox->y << 2) + (rbox->x >> 5),
.columns = rbox->columns,
.masks = masks + 2 * left_origin,
.real_start = (left_origin > 0),
.real_end = (rbox->x & 31) && (left_origin + img_columns < 5),
.vram_stride = 4 - rbox->columns,
.data_stride = ((img_columns - rbox->columns) << 2) * layers,
.gray = (v2 != NULL),
.f = f,
};
bopti_grid((void **)&layer, rbox->height, &c);
}
/* Specialized, faster version for single-column single-position instances */
void bopti_render_scsp(bopti_image_t const *img, struct rbox *rbox,
uint32_t *v1, uint32_t *v2)
{
/* Compute the only rendering mask */
uint32_t mask =
(0xffffffff << (32 - rbox->width)) >> (rbox->visual_x & 31);
/* Number of layers */
int layers = img->profile - (img->profile >> 1) + 1;
/* Number of longwords to skip between rows of [img] */
int img_stride = ((img->width + 31) >> 5) * layers;
/* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_stride) + (rbox->left * layers);
/* Starting value of VRAM pointers */
int offset = (rbox->y << 2) + (rbox->visual_x >> 5);
v1 += offset;
/* Number of rows */
int rows = rbox->height;
/* Render the grid immediately; mono version */
if(!v2)
{
asm_mono_scsp_t *f = asm_mono_scsp[img->profile];
while(rows--)
{
f(v1, layer, mask, rbox->x);
layer += img_stride;
v1 += 4;
}
}
/* Gray version */
else
{
asm_gray_scsp_t *f = asm_gray_scsp[img->profile];
v2 += offset;
while(rows--)
{
f(v1, layer, mask, v2, rbox->x);
layer += img_stride;
v1 += 4;
v2 += 4;
}
}
}
int bopti_clip(bopti_image_t const *img, struct rbox *r)
{
/* This load/save is not elegant but it makes GCC use register-only
operations, which is what we need for efficiency */
int x = r->visual_x, y = r->y;
int left = r->left, top = r->top;
int width = r->width, height = r->height;
/* Adjust the bounding box of the input image */
if(left < 0) width += left, x -= left, left = 0;
if(top < 0) height += top, y -= top, top = 0;
if(left + width > img->width) width = img->width - left;
if(top + height > img->height) height = img->height - top;
/* Intersect with the bounding box on-screen */
if(x < 0) width += x, left -= x, x = 0;
if(y < 0) height += y, top -= y, y = 0;
if(x + width > DWIDTH) width = DWIDTH - x;
if(y + height > DHEIGHT) height = DHEIGHT - y;
r->visual_x = x;
r->y = y;
r->left = left;
r->top = top;
r->width = width;
r->height = height;
/* Return non-zero if the result is empty */
return (width <= 0 || height <= 0);
}