From 09c13676d3c289b9aa39e1725d82f382154a980d Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Sun, 15 May 2022 12:56:59 +0100
Subject: [PATCH] image: arbitrary linear transforms

---
 CMakeLists.txt                        |   2 +
 include/gint/image.h                  |  35 ++++---
 src/image/fixed.h                     |  32 +++++--
 src/image/image_linear.S              | 126 ++++++++++++++++++++++++++
 src/image/image_linear.c              |  34 +++++++
 src/image/image_rotate_around_scale.c |  50 +++++++++-
 src/image/image_scale.c               |   8 +-
 7 files changed, 262 insertions(+), 25 deletions(-)
 create mode 100644 src/image/image_linear.S
 create mode 100644 src/image/image_linear.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15f04ee..574f156 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,6 +174,8 @@ set(SOURCES_CG
   src/image/image_get_pixel.c
   src/image/image_hflip.c
   src/image/image_hflip_alloc.c
+  src/image/image_linear.c
+  src/image/image_linear.S
   src/image/image_rotate.c
   src/image/image_rotate_around.c
   src/image/image_rotate_around_scale.c
diff --git a/include/gint/image.h b/include/gint/image.h
index e1f4e39..ce6cff1 100644
--- a/include/gint/image.h
+++ b/include/gint/image.h
@@ -357,9 +357,11 @@ void image_set_pixel(image_t const *img, int x, int y, int value);
    Note that conversions to RGB16 are not lossless because RGB565, P8 and P4
    can represent any color; if a color equal to image_alpha(IMAGE_RGB565A) is
    found during conversion, this function transforms it slightly to look
-   similar instead of being transparent.
+   similar instead of erroneously generating a transparent pixel.
 
-   Formats: RGB16 → RGB16, P8 → Anything, P4 → Anything */
+   Formats: RGB16 → RGB16, P8 → Anything, P4 → Anything
+   Size requirement: none (clipping is performed)
+   Supports in-place: No (useless) */
 void image_copy(image_t const *src, image_t *dst, bool copy_alpha);
 
 /* image_copy_alloc(): Convert and copy into a new image
@@ -373,8 +375,6 @@ void image_fill(image_t *img, int value);
 /* image_clear(): Fill a transparent image with its transparent value */
 void image_clear(image_t *img);
 
-/* TODO: Expand by taking from libimg */
-
 //---
 // Sub-image extraction
 //---
@@ -400,7 +400,9 @@ void image_clear(image_t *img);
      image_hflip(src, image_sub(dst, x, y, w, h));
 
    However, another call to image_sub() or image_at() will override the
-   sub-image, so you should only use this in such temporary settings.
+   sub-image, so you should only use this in such temporary settings. If you
+   need multiple image_sub() or image_at() calls in the same statement, only
+   one can use the short form.
 
    If the requested rectangle does not intersect the source, the sub-image will
    be of dimension 0x0. If the image format does not support sub-images (P4),
@@ -468,16 +470,30 @@ image_t *image_vflip_alloc(image_t const *src);
    function that can perform any combination of rotation, mirroring and scaling
    with nearest-neighbor sampling.
 
-   The [image_linear_opt] structure defines the settings for the transform.
+   The [image_linear_map] structure defines the settings for the transform.
    Users familiar with linear algebra might want to use it directly, but they
    are most conveniently generated with the rotation and scaling functions
    listed below.
 
+   Note: Currently the structure for the transform is modified by the
+   operation and cannot be reused.
+
+   The image_linear_alloc() variant allocates a new image in addition to
+   performing the transform. The image is created with size (map->dst_w,
+   map->dst_h) which is always a reasonable default. If a target image of
+   smaller size is supplied to image_linear(), clipping is performed; only the
+   top-left corner of the full output is actually rendered.
+
    Formats: RGB16, P8
    Size requirement: none (clipping through image_linear_opt settings)
    Supports in-place: No */
 
 struct image_linear_map {
+    /* Dimensions of the source and destination */
+    int src_w, src_h, dst_w, dst_h;
+    /* Input and output stride in bytes */
+    int src_stride, dst_stride;
+
     /* The following parameters define the linear transformation as a mapping
        from coordinates in the destination image (x and y) into coordinates in
        the source image (u and v).
@@ -490,15 +506,12 @@ struct image_linear_map {
        All of these values are specified as 16:16 fixed-point, ie. they encode
        decimal values by multiplying them by 65536. */
     int u, v, dx_u, dx_v, dy_u, dy_v;
-
-    /* Dimensions of the source and destination */
-    int src_w, src_h, dst_w, dst_h;
 };
 
 void image_linear(image_t const *src, image_t *dst,
-    struct image_linear_map const *map);
+    struct image_linear_map *map);
 image_t *image_linear_alloc(image_t const *src,
-    struct image_linear_map const *map);
+    struct image_linear_map *map);
 
 /* image_scale(): Upscale or downscale an image
 
diff --git a/src/image/fixed.h b/src/image/fixed.h
index b4c4474..2384cd3 100644
--- a/src/image/fixed.h
+++ b/src/image/fixed.h
@@ -11,13 +11,7 @@
 /* Multiplication */
 static inline int fmul(int x, int y)
 {
-    return ((int64_t)x * (int64_t)y) >> 32;
-}
-
-/* Multiplication with a scalar */
-static inline int fmuls(int x, int s)
-{
-    return ((int64_t)x * (int64_t)s) >> 16;
+    return ((int64_t)x * (int64_t)y) >> 16;
 }
 
 /* Division */
@@ -26,4 +20,28 @@ static inline int fdiv(int x, int y)
     return ((int64_t)x << 16) / y;
 }
 
+/* Integer square root: largest r >= 0 with r * r <= n (0 when n <= 0) */
+static inline int isqrt(int n)
+{
+    if(n <= 0) return 0;
+    if(n < 4) return 1;
+
+    int low_bound = isqrt(n / 4) * 2; /* isqrt(n) is low_bound or low_bound+1 */
+    int high_bound = low_bound + 1;
+    /* Divide instead of squaring: high_bound squared overflows near INT_MAX */
+    return (high_bound <= n / high_bound) ? high_bound : low_bound;
+}
+
+/* Floor operation: shift out the 16 fractional bits to get an integer */
+static inline int ffloor(int x)
+{
+    return (x >> 16);
+}
+
+/* Round operation: floor after adding fconst(0.5), i.e. round to nearest */
+static inline int fround(int x)
+{
+    return ffloor(x + fconst(0.5));
+}
+
 #endif /* GINT_IMAGE_FIXED */
diff --git a/src/image/image_linear.S b/src/image/image_linear.S
new file mode 100644
index 0000000..ee6b43d
--- /dev/null
+++ b/src/image/image_linear.S
@@ -0,0 +1,126 @@
+.global _image_linear_rgb16
+.global _image_linear_p8
+
+/* The loop nest for the rotation + scaling code, manually optimized.
+   r0, r1: (temporary), u
+   r2, r3: dx_u, dx_v
+   r4:     input_pixels
+   r5:     output_pixels (advanced as pixels are produced)
+   r6, r7: drow_u, drow_v (r6 is the map pointer until the fields are loaded)
+   r8:     line counter (starts at dst_h)
+   r9:     dst_w
+   r10:    src_w << 16 (for bound checks)
+   r11:    src_h << 16 (for bound checks)
+   r12:    v
+   r13:    (temporary)
+   r14:    src_stride (for index access to input_pixels)
+   @r15:   dst_stride - DEPTH * dst_w (gap between output rows, in bytes)
+
+   This loop maintains the value of (u,v) at each pixel by adding (dx_u, dx_v)
+   every pixel and (drow_u, drow_v) every row. For each position, it then
+   checks whether 0 <= u < src_w and 0 <= v < src_h as fixed-point; if yes,
+   the pixel at byte offset (int)v * src_stride + (int)u * DEPTH of the input
+   is copied to the output; otherwise, the output pixel is left untouched. */
+.macro GEN_LINEAR_LOOP MEM, DEPTH
+	mov.l	r8, @-r15	/* Save r8-r14; restored before rts */
+	mov.l	r9, @-r15
+	mov.l	r10, @-r15
+	mov.l	r11, @-r15
+	mov.l	r12, @-r15
+	mov.l	r13, @-r15
+	mov.l	r14, @-r15
+	mov.l	@r6+, r10	/* map.src_w */
+	mov.l	@r6+, r11	/* map.src_h */
+	mov.l	@r6+, r9	/* map.dst_w */
+	mov.l	@r6+, r8	/* map.dst_h */
+	mov.l	@r6+, r14	/* map.src_stride */
+	mov.l	@r6+, r0	/* map.dst_stride */
+	mov.l	@r6+, r1	/* map.u */
+	mov.l	@r6+, r12	/* map.v */
+	mov.l	@r6+, r2	/* map.dx_u */
+	mov.l	@r6+, r3	/* map.dx_v */
+
+	mov.l	@(4, r6), r7	/* map.dy_v (replaced with drow_v) */
+	shll16	r10	/* r10 = src_w << 16 */
+
+	mov.l	@r6, r6		/* map.dy_u (replaced with drow_u) */
+	shll16	r11	/* r11 = src_h << 16 */
+
+	/* Compute the output stride as map.dst_stride - (DEPTH * map.dst_w) */
+
+	ldrs	1f	/* Hardware repeat loop: body starts at label 1 */
+	sub	r9, r0	/* r0 = dst_stride - dst_w */
+
+	ldre	2f	/* Hardware repeat loop: body ends at label 2 */
+ .if \DEPTH == 2
+	sub	r9, r0	/* r0 = dst_stride - 2 * dst_w */
+ .else
+	nop
+ .endif
+
+ 	mov.l	r0, @-r15	/* Keep the row gap on the stack */
+ 	nop
+
+4:	ldrc	r9	/* New row: repeat count for the pixel body is dst_w */
+	nop
+
+1:	cmp/hs	r10, r1	/* Unsigned compare, so u < 0 also fails the check */
+	nop
+
+	bt	3f	/* u out of bounds: skip this pixel */
+	cmp/hs	r11, r12	/* Same check for v against src_h << 16 */
+
+	bt	3f	/* v out of bounds: skip this pixel */
+	swap.w	r12, r13	/* Low word of r13 = (int)v */
+
+	mov	r1, r0
+	mulu.w	r13, r14	/* macl = (int)v * src_stride (16-bit operands) */
+
+	shlr16	r0	/* r0 = (int)u */
+	sts	macl, r13
+
+ .if \DEPTH == 2
+	shll	r0	/* Two bytes per pixel */
+	nop
+ .endif
+
+	add	r13, r0	/* r0 = byte offset of the source pixel */
+	\MEM	@(r0, r4), r13
+
+	\MEM	r13, @r5
+     3:	add	#\DEPTH, r5	/* Next output pixel (also when skipped) */
+
+	add	r2, r1	/* u += dx_u */
+	nop
+
+	add	r3, r12	/* v += dx_v */
+2:	nop
+
+	dt	r8	/* One more row done; T is set when none remain */
+	mov.l	@r15, r0	/* Stride between lines, excluding content */
+
+	add	r6, r1	/* u += drow_u */
+	nop
+
+	add	r7, r12	/* v += drow_v */
+	nop
+
+	bf.s	4b	/* Loop while rows remain */
+	add	r0, r5	/* (delay slot) skip the gap to the next output row */
+
+	mov.l	@r15+, r0	/* Drop the saved row gap */
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r12
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r9
+	rts
+	mov.l	@r15+, r8	/* (delay slot) last register restore */
+.endm
+
+_image_linear_rgb16:
+	GEN_LINEAR_LOOP mov.w, 2
+
+_image_linear_p8:
+	GEN_LINEAR_LOOP mov.b, 1
diff --git a/src/image/image_linear.c b/src/image/image_linear.c
new file mode 100644
index 0000000..f3df33a
--- /dev/null
+++ b/src/image/image_linear.c
@@ -0,0 +1,34 @@
+#include <gint/image.h>
+#include <gint/defs/util.h>
+#include "fixed.h"
+
+void image_linear_rgb16(void *src, void *dst, struct image_linear_map *map);
+void image_linear_p8(void *src, void *dst, struct image_linear_map *map);
+
+void image_linear(image_t const *src, image_t *dst,
+    struct image_linear_map *map)
+{
+    if(!image_target(src, dst, NOT_P4, SAME_DEPTH))
+        return;
+
+    /* Clip the destination; bail out if nothing is left to render, since the
+       assembler row loop decrements before testing and never stops at 0 */
+    map->dst_w = min(map->dst_w, dst->width);
+    map->dst_h = min(map->dst_h, dst->height);
+    if(map->dst_w <= 0 || map->dst_h <= 0) return;
+
+    /* Change dy to mean drow before calling the assembler code: at the end
+       of a row, undo its dst_w per-pixel steps and move down one row */
+    map->dy_u -= map->dx_u * map->dst_w;
+    map->dy_v -= map->dx_v * map->dst_w;
+
+    /* Record strides (in bytes) */
+    map->src_stride = src->stride;
+    map->dst_stride = dst->stride;
+
+    /* Call the assembler implementation matching the pixel format */
+    if(IMAGE_IS_RGB16(src->format))
+        image_linear_rgb16(src->data, dst->data, map);
+    else if(IMAGE_IS_P8(src->format))
+        image_linear_p8(src->data, dst->data, map);
+}
diff --git a/src/image/image_rotate_around_scale.c b/src/image/image_rotate_around_scale.c
index 65faf97..a8b725b 100644
--- a/src/image/image_rotate_around_scale.c
+++ b/src/image/image_rotate_around_scale.c
@@ -1,7 +1,8 @@
 #include <gint/image.h>
+#include <math.h>
 #include "fixed.h"
 
-void image_rotate_around_scale(image_t const *src, float angle, int gamma,
+void image_rotate_around_scale(image_t const *src, float alpha, int gamma,
     bool resize, int *center_x, int *center_y, struct image_linear_map *map)
 {
     if(!image_valid(src))
@@ -10,7 +11,50 @@ void image_rotate_around_scale(image_t const *src, float angle, int gamma,
     map->src_w = src->width;
     map->src_h = src->height;
 
-    /* Don't try to resize cleanly; just add a √2 factor in both dimensions if
+    /* Compute the rotation basis */
+    int cos_alpha = fconst(cosf(alpha));
+    int sin_alpha = fconst(sinf(alpha));
+    int inv_gamma = fdiv(fconst(1.0), gamma);
+
+    map->dx_u = fmul(cos_alpha, inv_gamma);
+    map->dx_v = fmul(sin_alpha, inv_gamma);
+
+    map->dy_u = -fmul(sin_alpha, inv_gamma);
+    map->dy_v =  fmul(cos_alpha, inv_gamma);
+
+    /* Don't try to resize cleanly; just make the longest diagonal the width if
        [resize=true] to make sure everything fits */
-    ;
+    if(resize) {
+        int diag = isqrt(src->width * src->width + src->height * src->height);
+        map->dst_w = fround(gamma * diag);
+        map->dst_h = fround(gamma * diag);
+    }
+    else {
+        map->dst_w = fround(gamma * src->width);
+        map->dst_h = fround(gamma * src->height);
+    }
+
+    /* Compute the new location of the anchor relative to the image center.
+       This is found by a neat trick: rotate it with the same angle */
+    int ax = *center_x - map->src_w / 2;
+    int ay = *center_y - map->src_h / 2;
+
+    int ax2 = fround(fmul(gamma,  cos_alpha * ax + sin_alpha * ay));
+    int ay2 = fround(fmul(gamma, -sin_alpha * ax + cos_alpha * ay));
+
+    int new_center_x = ax2 + map->dst_w / 2;
+    int new_center_y = ay2 + map->dst_h / 2;
+
+    /* Finally, determine the initial value of (u,v). We know that it evaluates
+       to (center_x, center_y) when on the new center point (new_center_x,
+       new_center_y); apply the difference accordingly. */
+    map->u = fconst(*center_x)
+           - map->dx_u * new_center_x
+           - map->dy_u * new_center_y;
+    map->v = fconst(*center_y)
+           - map->dx_v * new_center_x
+           - map->dy_v * new_center_y;
+
+    *center_x = new_center_x;
+    *center_y = new_center_y;
 }
diff --git a/src/image/image_scale.c b/src/image/image_scale.c
index f038c57..7910bfd 100644
--- a/src/image/image_scale.c
+++ b/src/image/image_scale.c
@@ -7,8 +7,8 @@ void image_scale(image_t const *src, int gamma_x, int gamma_y,
     if(!image_valid(src))
         return;
 
-    int inv_gamma_x = fdiv(fconst(1), gamma_x);
-    int inv_gamma_y = fdiv(fconst(1), gamma_y);
+    int inv_gamma_x = fdiv(fconst(1.0), gamma_x);
+    int inv_gamma_y = fdiv(fconst(1.0), gamma_y);
 
     map->u = fconst(0);
     map->v = fconst(0);
@@ -19,6 +19,6 @@ void image_scale(image_t const *src, int gamma_x, int gamma_y,
 
     map->src_w = src->width;
     map->src_h = src->height;
-    map->dst_w = fmuls(src->width, gamma_x);
-    map->dst_h = fmuls(src->height, gamma_y);
+    map->dst_w = fround(src->width * gamma_x);
+    map->dst_h = fround(src->height * gamma_y);
 }