bopti: performance improvements for SCSP cases

* Defined the single-column single-position (SCSP) situation where a
  single column of the input is blit on a single position of the VRAM.
  Provided optimized assembly and a specialized bopti_render_scsp()
  function.
* Improved the renderer by reducing the amount of computation and
  clarifying the semantics of the rbox.
* Separated rbox setup from clipping by making bopti_render_clip() a
  purely abstract superset of bopti_render_noclip().
This commit is contained in:
Lephe 2020-07-23 10:18:46 +02:00
parent 39664e9bd2
commit 11dd04243f
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
8 changed files with 380 additions and 88 deletions

View File

@ -2,14 +2,6 @@
#include "../render-fx/render-fx.h"
#include "../render-fx/bopti-asm.h"
/* List of rendering functions */
static void *bopti_asm[] = {
bopti_gasm_mono,
bopti_gasm_mono_alpha,
bopti_gasm_gray,
bopti_gasm_gray_alpha,
};
/* gsubimage(): Render a section of an image */
void gsubimage(int x, int y, bopti_image_t const *img, int left, int top,
int width, int height, int flags)
@ -20,11 +12,11 @@ void gsubimage(int x, int y, bopti_image_t const *img, int left, int top,
if(flags & DIMAGE_NOCLIP)
{
bopti_render_noclip(x, y, img, left, top, width, height,
light, dark, bopti_asm[img->profile]);
light, dark);
}
else
{
bopti_render_clip(x, y, img, left, top, width, height,
light, dark, bopti_asm[img->profile]);
light, dark);
}
}

View File

@ -0,0 +1,152 @@
.global _bopti_gasm_mono_scsp
.global _bopti_gasm_mono_alpha_scsp
.global _bopti_gasm_gray_scsp
.global _bopti_gasm_gray_alpha_scsp
# _bopti_gasm_mono_scsp: SCSP blit of the "mono" profile onto gray VRAMs
#
# Loads one layer longword, shifts it, keeps only the bits selected by the
# mask, and replaces those same bits in one longword of the light VRAM and one
# longword of the dark VRAM. Both VRAMs receive the same image data.
#
# REGISTER ALLOCATION:
# r0: OR layer
# r1: (temp)
# r2: light vram longword
# r3: dark vram longword
# --
# r4: light pointer
# r5: layer pointer
# r6: mask
# r7: dark pointer
# --
# @r15: -(x&31)
_bopti_gasm_mono_scsp:
# Read layer longword and shift it
mov.l @r5, r0
# The shift count is read from the stack (see @r15 above). shld shifts left
# for a positive count and right for a negative one.
mov.l @r15, r1
shld r1, r0
# Clear target VRAM and unwanted image data
and r6, r0
mov.l @r4, r2
# From here on r6 holds the inverted mask, which selects the VRAM bits to keep
not r6, r6
mov.l @r7, r3
and r6, r2
and r6, r3
# Blit and return
or r0, r2
or r0, r3
mov.l r2, @r4
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r7
# _bopti_gasm_mono_alpha_scsp: SCSP blit of the "mono_alpha" profile onto gray
# VRAMs
#
# Loads two consecutive layer longwords (AND layer, then OR layer), shifts
# and masks them, then for both the light and the dark VRAM longword clears
# the bits set in the AND layer and sets the bits set in the OR layer. VRAM
# bits outside the mask are left untouched because both layers are zero there.
#
# REGISTER ALLOCATION:
# r0: AND layer
# r1: (temp)
# r2: light vram longword
# r3: dark vram longword
# --
# r4: light pointer
# r5: layer pointer, then OR layer
# r6: mask
# r7: dark pointer
# --
# @r15: -(x&31)
_bopti_gasm_mono_alpha_scsp:
# Read layer longwords and shift them
mov.l @r5, r0
# The shift count is read from the stack (see @r15 above)
mov.l @r15, r1
# r5 is overwritten with the OR layer; the layer pointer is no longer needed
mov.l @(4,r5), r5
shld r1, r0
shld r1, r5
# Clear any unwanted image data
and r6, r0
mov.l @r4, r2
and r6, r5
mov.l @r7, r3
# Blit and return
# Inverting the AND layer gives the set of VRAM bits to preserve
not r0, r0
and r0, r2
and r0, r3
or r5, r2
or r5, r3
mov.l r2, @r4
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r7
# _bopti_gasm_gray_scsp: SCSP blit of the "gray" profile onto gray VRAMs
#
# Loads two consecutive layer longwords (LIGHT layer, then DARK layer), shifts
# and masks them, clears the masked bits of both VRAM longwords, then ORs the
# LIGHT layer into the light VRAM and the DARK layer into the dark VRAM.
#
# REGISTER ALLOCATION:
# r0: LIGHT layer
# r1: (temp)
# r2: light vram longword
# r3: dark vram longword
# --
# r4: light pointer
# r5: layer pointer, then DARK layer
# r6: mask
# r7: dark pointer
# --
# @r15: -(x&31)
_bopti_gasm_gray_scsp:
# Read layer longwords and shift them
mov.l @r5, r0
# The shift count is read from the stack (see @r15 above)
mov.l @r15, r1
# r5 is overwritten with the DARK layer; the layer pointer is no longer needed
mov.l @(4,r5), r5
shld r1, r0
shld r1, r5
# Clear target VRAM and unapplied image data
and r6, r0
mov.l @r4, r2
and r6, r5
mov.l @r7, r3
# From here on r6 holds the inverted mask, which selects the VRAM bits to keep
not r6, r6
and r6, r2
and r6, r3
# Blit and return
or r0, r2
or r5, r3
mov.l r2, @r4
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r7
# _bopti_gasm_gray_alpha_scsp: SCSP blit of the "gray_alpha" profile onto gray
# VRAMs
#
# Loads three consecutive layer longwords (AND, LIGHT, DARK), shifts and masks
# them, clears in both VRAM longwords the bits set in the AND layer, then ORs
# the LIGHT layer into the light VRAM and the DARK layer into the dark VRAM.
#
# REGISTER ALLOCATION:
# r0: AND layer
# r1: LIGHT layer
# r2: (temp), then light vram longword
# r3: dark vram longword
# --
# r4: light pointer
# r5: layer pointer, then DARK layer
# r6: mask
# r7: dark pointer
# --
# @r15: -(x&31)
_bopti_gasm_gray_alpha_scsp:
# Read layer longwords and shift them
mov.l @r5, r0
mov.l @(4,r5), r1
# r5 is overwritten with the DARK layer; the layer pointer is no longer needed
mov.l @(8,r5), r5
# The shift count is read from the stack (see @r15 above); r2 holds it only
# until the three shifts are done
mov.l @r15, r2
shld r2, r0
shld r2, r1
shld r2, r5
# Clear unapplied image data
and r6, r0
and r6, r1
and r6, r5
# Blit the AND layer
mov.l @r4, r2
# Inverting the AND layer gives the set of VRAM bits to preserve
not r0, r0
mov.l @r7, r3
and r0, r2
and r0, r3
# Blit the LIGHT and DARK layers, and return
or r1, r2
or r5, r3
mov.l r2, @r4
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r7

View File

@ -1,3 +1,4 @@
.global _bopti_gasm_mono
.global _bopti_gasm_mono_alpha
.global _bopti_gasm_gray

View File

@ -0,0 +1,60 @@
.global _bopti_asm_mono_scsp
.global _bopti_asm_mono_alpha_scsp
# _bopti_asm_mono_scsp: SCSP blit of the "mono" profile onto a single VRAM
#
# Loads one layer longword, shifts it by r7, keeps only the bits selected by
# the mask, and replaces those same bits in the VRAM longword at @r4.
#
# REGISTER ALLOCATION:
# r0: layer
# r1: -
# r2: (unused)
# r3: vram longword
# --
# r4: vram pointer
# r5: layer pointer
# r6: mask
# r7: -(x&31)
_bopti_asm_mono_scsp:
# Read layer longword and shift it
mov.l @r5, r0
# shld shifts left for a positive count and right for a negative one
shld r7, r0
# Clear the target VRAM and unwanted image data
mov.l @r4, r3
and r6, r0
# From here on r6 holds the inverted mask, which selects the VRAM bits to keep
not r6, r6
and r6, r3
# Blit and return
or r0, r3
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r4
# _bopti_asm_mono_alpha_scsp: SCSP blit of the "mono_alpha" profile onto a
# single VRAM
#
# Loads two consecutive layer longwords (AND layer, then OR layer), shifts
# them by r7 and masks them, then clears in the VRAM longword the bits set in
# the AND layer and sets the bits set in the OR layer.
#
# REGISTER ALLOCATION:
# r0: AND layer
# r1: OR layer
# r2: (unused)
# r3: vram longword
# --
# r4: vram pointer
# r5: layer pointer
# r6: mask
# r7: -(x&31)
_bopti_asm_mono_alpha_scsp:
# Read layer longwords and shift them
mov.l @r5, r0
mov.l @(4,r5), r1
shld r7, r0
shld r7, r1
# Apply masks to clear layer data
and r6, r0
and r6, r1
# Blit to VRAM
mov.l @r4, r3
# Inverting the AND layer gives the set of VRAM bits to preserve
not r0, r0
and r0, r3
or r1, r3
rts
# Delay slot of rts: this store executes before control returns to the caller
mov.l r3, @r4

View File

@ -5,6 +5,9 @@
#ifndef GINT_RENDERFX_BOPTIASM
#define GINT_RENDERFX_BOPTIASM
#include <gint/defs/types.h>
#include <gint/defs/attributes.h>
/* pair_t: A pair of consecutive VRAM longwords */
typedef struct {
uint32_t l;
@ -24,6 +27,20 @@ typedef pair_t asm_mono_t(pair_t p, void **layer, uint32_t *masks, int x);
/* Signature of gray rendering functions */
typedef void asm_gray_t(quadr_t q, void **layer, uint32_t *masks, int x,
quadr_t *ret);
/* Signature of mono single-column single-position rendering functions
   @vram   VRAM longword to update (read, blended, then written back)
   @layer  Image layer data to read from; not modified
   @mask   Bits of the longword that belong to the rendered area
   @x      Shift count applied to the layer data before blending */
typedef void asm_mono_scsp_t(uint32_t *vram, uint32_t const *layer,
uint32_t mask, int x);
/* Signature of gray single-column single-position rendering functions
   @v1     Light VRAM longword to update
   @layer  Image layer data to read from; not modified
   @mask   Bits of the longword that belong to the rendered area
   @v2     Dark VRAM longword to update
   @x      Shift count applied to the layer data before blending (the
           assembly implementations read it from the stack) */
typedef void asm_gray_scsp_t(uint32_t *v1, uint32_t const *layer,
uint32_t mask, uint32_t *v2, int x);
/* Type of any rendering function. Holds a single function pointer; the member
   to call is the one that was assigned, which depends on whether the target
   is mono or gray and on whether the SCSP variant is used. */
typedef union {
asm_mono_t *asm_mono;
asm_gray_t *asm_gray;
asm_mono_scsp_t *asm_mono_scsp;
asm_gray_scsp_t *asm_gray_scsp;
} bopti_asm_t;
/* Each of the following rendering functions:
1. Takes VRAM data for two longword positions of the screen.
@ -40,11 +57,31 @@ extern asm_mono_t bopti_asm_mono_alpha;
/* bopti_gasm_mono(): "mono" profile on gray VRAMs */
extern asm_gray_t bopti_gasm_mono;
/* bopti_gasm_mono_alpha(): "mono alpha" profile on gray VRAMs */
/* bopti_gasm_mono_alpha(): "mono_alpha" profile on gray VRAMs */
extern asm_gray_t bopti_gasm_mono_alpha;
/* bopti_gasm_gray(): Rendering function for the "gray" profile */
extern asm_gray_t bopti_gasm_gray;
/* bpoti_asm_gray_alpha(): Rendering function for the "gray alpha" profile */
/* bopti_gasm_gray_alpha(): Rendering function for the "gray_alpha" profile */
extern asm_gray_t bopti_gasm_gray_alpha;
/* Each of the following single-column single-position (SCSP) rendering
functions:
1. Takes VRAM data from one longword position of the screen (one longword in
each VRAM for the gray variants).
2. Reads one longword per layer of the profile from layer.
3. Shifts the image data and applies it to the VRAM position.
None update the layer pointer. */
/* bopti_asm_mono_scsp(): SCSP "mono" profile */
extern asm_mono_scsp_t bopti_asm_mono_scsp;
/* bopti_asm_mono_alpha_scsp(): SCSP "mono_alpha" profile */
extern asm_mono_scsp_t bopti_asm_mono_alpha_scsp;
/* bopti_gasm_mono_scsp(): SCSP "mono" profile on gray VRAMs */
extern asm_gray_scsp_t bopti_gasm_mono_scsp;
/* bopti_gasm_mono_alpha_scsp(): SCSP "mono_alpha" profile on gray VRAMs */
extern asm_gray_scsp_t bopti_gasm_mono_alpha_scsp;
/* bopti_gasm_gray_scsp(): SCSP "gray" profile */
extern asm_gray_scsp_t bopti_gasm_gray_scsp;
/* bopti_gasm_gray_alpha_scsp(): SCSP "gray_alpha" profile */
extern asm_gray_scsp_t bopti_gasm_gray_alpha_scsp;
#endif /* GINT_RENDERFX_BOPTIASM */

View File

@ -17,7 +17,7 @@ struct rbox
/* Horizontal bounds of the box in the image (included, in columns) */
int left, right;
/* Vertical bounds of the box in the image (inc-excluded, in pixels) */
int top, bottom;
int top, height;
};
/* struct command: A rendering command
@ -51,11 +51,30 @@ struct command
int gray;
/* Assembly function, prototype depends on image type */
union {
void *asm_void;
asm_mono_t *asm_mono;
asm_gray_t *asm_gray;
};
bopti_asm_t f;
};
/* Lists of rendering functions. Each table is indexed by img->profile by the
   renderers in this file. */
/* General renderers for a single (mono) VRAM. Only two entries: indexing with
   a profile above 1 would read past the end of the table.
   NOTE(review): presumably callers only pass mono/mono_alpha images when
   rendering without a second VRAM - confirm. */
static asm_mono_t *asm_mono[] = {
bopti_asm_mono,
bopti_asm_mono_alpha,
};
/* General renderers for gray (two-VRAM) rendering, one per profile */
static asm_gray_t *asm_gray[] = {
bopti_gasm_mono,
bopti_gasm_mono_alpha,
bopti_gasm_gray,
bopti_gasm_gray_alpha,
};
/* SCSP renderers for a single (mono) VRAM; same two-entry limit as asm_mono */
static asm_mono_scsp_t *asm_mono_scsp[] = {
bopti_asm_mono_scsp,
bopti_asm_mono_alpha_scsp,
};
/* SCSP renderers for gray (two-VRAM) rendering, one per profile */
static asm_gray_scsp_t *asm_gray_scsp[] = {
bopti_gasm_mono_scsp,
bopti_gasm_mono_alpha_scsp,
bopti_gasm_gray_scsp,
bopti_gasm_gray_alpha_scsp,
};
void bopti_grid(void **layer, int rows, struct command *c)
@ -90,7 +109,7 @@ void bopti_grid(void **layer, int rows, struct command *c)
/* The assembly routine blends a longword of data onto
the pair and returns the resulting pair. */
pret = c->asm_mono(p, layer, c->masks+col+col, -c->x);
pret = c->f.asm_mono(p, layer, c->masks+col+col,-c->x);
/* Write back the result into VRAM, except for column
-1 (occurs once every row, iff visual_x < 0) */
@ -122,7 +141,7 @@ void bopti_grid(void **layer, int rows, struct command *c)
q.l2 = (c->x) ? qret.r2 : q.r2;
q.r2 = v2[(offset + 1) & 0xff];
c->asm_gray(q, layer, c->masks+col+col, -c->x, &qret);
c->f.asm_gray(q, layer, c->masks+col+col, -c->x,&qret);
if(c->real_start + col)
{
@ -145,11 +164,16 @@ void bopti_grid(void **layer, int rows, struct command *c)
}
void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
uint32_t *v2, void *bopti_asm)
uint32_t *v2)
{
/* Rendering function */
bopti_asm_t f;
if(v2) f.asm_gray = asm_gray[img->profile];
else f.asm_mono = asm_mono[img->profile];
/* Compute rendering masks */
uint32_t vm[4];
masks(rbox->visual_x, rbox->x + rbox->width - 1, vm);
masks(rbox->visual_x, rbox->visual_x + rbox->width - 1, vm);
/* Number of layers per profile */
int layer_count[] = { 1, 2, 2, 3 };
@ -181,8 +205,7 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
/* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data;
layer += rbox->top * img_columns * layers;
layer += rbox->left * layer_count[img->profile];
layer += (rbox->top * img_columns + rbox->left) * layers;
/* Number of grid columns */
int columns = rbox->right - rbox->left + 1;
@ -199,73 +222,108 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
.vram_stride = 4 - columns,
.data_stride = ((img_columns - columns) << 2) * layers,
.gray = (v2 != NULL),
.asm_void = bopti_asm,
.f = f,
};
bopti_grid((void **)&layer, rbox->bottom - rbox->top, &c);
bopti_grid((void **)&layer, rbox->height, &c);
}
void bopti_render_clip(int visual_x, int y, bopti_image_t const *img, int left,
int top, int width, int height, uint32_t *v1, uint32_t *v2,
void *bopti_asm)
/* Specialized, faster version for single-column single-position instances */
void bopti_render_scsp(bopti_image_t const *img, struct rbox *rbox,
uint32_t *v1, uint32_t *v2)
{
/* Left pixel of leftmost column */
int x = visual_x - (left & 31);
width += (left & 31);
left &= ~31;
/* Rendering function */
bopti_asm_t f;
if(v2) f.asm_gray_scsp = asm_gray_scsp[img->profile];
else f.asm_mono_scsp = asm_mono_scsp[img->profile];
/* Compute the only rendering mask. Avoid UB if width = 32 */
uint32_t mask = 0xffffffff;
if(rbox->width < 32)
{
int right = 32 - ((rbox->visual_x & 31) + rbox->width);
mask = ((1 << rbox->width) - 1) << right;
}
/* Number of layers */
int layer_count[] = { 1, 2, 2, 3 };
int layers = layer_count[img->profile];
/* Number of columns in [img] */
int img_columns = (img->width + 31) >> 5;
/* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers;
/* Starting value of VRAM pointers */
int offset = (rbox->y << 2) + (rbox->visual_x >> 5);
v1 += offset;
if(v2) v2 += offset;
/* Number of rows */
int rows = rbox->height;
/* Mask shift */
int shift = -(rbox->x & 31);
if(rbox->x < 0) shift += 32;
/* Render the grid immediately; mono version */
if(!v2) while(rows--)
{
f.asm_mono_scsp(v1, layer, mask, shift);
layer += img_columns * layers;
v1 += 4;
}
/* Gray version */
else while(rows--)
{
f.asm_gray_scsp(v1, layer, mask, v2, shift);
layer += img_columns * layers;
v1 += 4;
v2 += 4;
}
}
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
int top, int width, int height, uint32_t *v1, uint32_t *v2)
{
/* Adjust the bounding box of the input image */
if(left < 0) width += left, x -= left, left = 0;
if(top < 0) height += top, y -= top, top = 0;
if(left + width > img->width) width = img->width - left;
if(top + height > img->height) height = img->height - top;
/* Check whether the box intersects the screen */
/* Intersect with the bounding box on-screen */
if(x < 0) width += x, left -= x, x = 0;
if(y < 0) height += y, top -= y, y = 0;
if(x + width > DWIDTH) width = DWIDTH - x;
if(y + height > DHEIGHT) height = DHEIGHT - y;
/* Early finish for empty intersections */
if(width <= 0 || height <= 0) return;
if(x + width <= 0 || x > 127 || y + height <= 0 || y > 63) return;
/* Intersect with the bounding box on-screen. We only need to make sure
that x>=-31, not x>=0. Setting x=0 would discard the horizontal
alignment information (x & 31). */
if(y < 0) top -= y, height += y, y = 0;
if(y + height > 64) height = (64 - y);
int bottom = top + height;
if(x < -32)
{
int overflow = (x + 32) >> 5;
overflow = -overflow << 5;
left += overflow;
width -= overflow;
x += overflow;
}
if(x + width > 128) width = (128 - x);
int right = (left + width - 1) >> 5;
left >>= 5;
/* Finish with the standard bopti renderer */
struct rbox rbox = { x, visual_x, y, width, left, right, top, bottom };
bopti_render(img, &rbox, v1, v2, bopti_asm);
/* Finish with the noclip variant */
bopti_render_noclip(x, y, img, left, top, width, height, v1, v2);
}
void bopti_render_noclip(int visual_x, int y, bopti_image_t const *img,
int left, int top, int width, int height, uint32_t *v1, uint32_t *v2,
void *bopti_asm)
int left, int top, int width, int height, uint32_t *v1, uint32_t *v2)
{
/* End row (excluded) */
int bottom = top + height;
/* Left pixel of leftmost column */
int x = visual_x - (left & 31);
width += (left & 31);
left &= ~31;
/* Start column and end column (included) */
int right = (left + width - 1) >> 5;
left >>= 5;
/* Start column and end column (both included) */
int cl = (left) >> 5;
int cr = (left + width - 1) >> 5;
/* Finish with the standard bopti renderer */
struct rbox rbox = { x, visual_x, y, width, left, right, top, bottom };
bopti_render(img, &rbox, v1, v2, bopti_asm);
struct rbox rbox = { 0, visual_x, y, width, cl, cr, top, height };
if(cl == cr && (visual_x & 31) + width <= 32)
{
rbox.x = (visual_x & 31) - (left & 31);
bopti_render_scsp(img, &rbox, v1, v2);
}
else
{
/* x-coordinate of the first pixel of the first column */
rbox.x = visual_x - (left & 31);
bopti_render(img, &rbox, v1, v2);
}
}

View File

@ -2,12 +2,6 @@
#include "render-fx.h"
#include "bopti-asm.h"
/* List of rendering functions */
static void *bopti_asm[] = {
bopti_asm_mono,
bopti_asm_mono_alpha,
};
/* dsubimage(): Render a section of an image */
void dsubimage(int x, int y, bopti_image_t const *img, int left, int top,
int width, int height, int flags)
@ -19,11 +13,11 @@ void dsubimage(int x, int y, bopti_image_t const *img, int left, int top,
if(flags & DIMAGE_NOCLIP)
{
bopti_render_noclip(x, y, img, left, top, width, height,
gint_vram, NULL, bopti_asm[img->profile]);
gint_vram, NULL);
}
else
{
bopti_render_clip(x, y, img, left, top, width, height,
gint_vram, NULL, bopti_asm[img->profile]);
gint_vram, NULL);
}
}

View File

@ -28,10 +28,9 @@ void masks(int x1, int x2, uint32_t *masks);
@x @y Location of the top-left corner
@img Image encoded by [fxconv]
@left @top @w @h Bounding box to render
@v1 @v2 VRAMs
@bopti_asm Rendering function */
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
int top, int w, int h, uint32_t *v1, uint32_t *v2, void *bopti_asm);
int top, int w, int h, uint32_t *v1, uint32_t *v2);
/* bopti_render_noclip(): Render a bopti image without clipping
This function is only ever slightly faster than bopti_render_clip(),
@ -42,10 +41,9 @@ void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
@x @y Location of the top-left corner
@img Image encoded by [fxconv]
@left @top @w @h Bounding box to render
@v1 @v2 VRAMs
@bopti_asm Rendering function */
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */
void bopti_render_noclip(int x, int y, bopti_image_t const *img, int left,
int top, int w, int h, uint32_t *v1, uint32_t *v2, void *bopti_asm);
int top, int w, int h, uint32_t *v1, uint32_t *v2);
//---
// Alternate rendering modes