From 1c42799d43ed1004de2cd3e33830e7ea00ff5066 Mon Sep 17 00:00:00 2001
From: Luke Wren
Date: Mon, 22 Nov 2021 02:44:39 +0000
Subject: [PATCH] Faster sprite_blit16 routine, more sprites in sprite_bounce

---
 software/apps/sprite_bounce/main.c |  2 +-
 software/libsprite/sprite.S        | 96 ++++++++++++++++++------------
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/software/apps/sprite_bounce/main.c b/software/apps/sprite_bounce/main.c
index 52fa370..1553a65 100644
--- a/software/apps/sprite_bounce/main.c
+++ b/software/apps/sprite_bounce/main.c
@@ -64,7 +64,7 @@
 #error "Select a video mode!"
 #endif
 
-#define N_BERRIES 50
+#define N_BERRIES 65
 #define LED_PIN 21
 
 struct dvi_inst dvi0;
diff --git a/software/libsprite/sprite.S b/software/libsprite/sprite.S
index c1d4807..b4fd190 100644
--- a/software/libsprite/sprite.S
+++ b/software/libsprite/sprite.S
@@ -197,15 +197,13 @@ decl_func sprite_fill16
 
 // r2: pixel count
 //
-// Unrolled loop body with an initial computed branch. Note we can go much
-// faster if r0 and r1 are co-aligned, but it's not all that helpful to have a
-// 1 in 4 chance of being really fast when minimising worst-case scanline time
+// Unrolled loop body with an initial computed branch.
 
decl_func sprite_blit8 mov ip, r0 lsrs r3, r2, #3 lsls r3, #3 - eors r2, r3 // r2 = pixels % 8, r3 = pixels = pixels % 8 + eors r2, r3 // r2 = pixels % 8, r3 = pixels - pixels % 8 add r0, r3 add r1, r3 @@ -282,45 +280,65 @@ decl_func sprite_blit8_alpha bx lr +.macro storew_alignh rd ra offs + strh \rd, [\ra, #\offs] + lsrs \rd, #16 + strh \rd, [\ra, #\offs + 2] +.endm + decl_func sprite_blit16 - mov ip, r0 - lsrs r3, r2, #3 - lsls r3, #3 - eors r2, r3 // r2 = pixels % 8, r3 = pixels = pixels % 8 - - lsls r3, #1 - add r0, r3 - add r1, r3 - - adr r3, 2f - lsls r2, #2 - subs r3, r2 - adds r3, #1 // thumb bit >:( - bx r3 - -.align 2 + // Force source pointer to be word-aligned + lsrs r3, r1, #2 + bcc 1f + ldrh r3, [r1] + strh r3, [r0] + adds r0, #2 + adds r1, #2 + subs r2, #1 1: - subs r0, #16 - subs r1, #16 - ldrh r3, [r1, #14] - strh r3, [r0, #14] - ldrh r3, [r1, #12] - strh r3, [r0, #12] - ldrh r3, [r1, #10] - strh r3, [r0, #10] - ldrh r3, [r1, #8] - strh r3, [r0, #8] - ldrh r3, [r1, #6] - strh r3, [r0, #6] - ldrh r3, [r1, #4] - strh r3, [r0, #4] - ldrh r3, [r1, #2] - strh r3, [r0, #2] - ldrh r3, [r1, #0] - strh r3, [r0, #0] + // Each loop is 8 pixels. Place limit pointer at 16 bytes before + // end, loop until past it. There will be 0 to 7 pixels remaining. + lsls r2, #1 + adds r2, r0 + subs r2, #16 + mov ip, r2 + b 2f +1: + ldmia r1!, {r2, r3} + storew_alignh r2, r0, 0 + storew_alignh r3, r0, 4 + ldmia r1!, {r2, r3} + storew_alignh r2, r0, 8 + storew_alignh r3, r0, 12 + adds r0, #16 2: cmp r0, ip - bhi 1b + bls 1b + + mov r2, ip + subs r2, r0 + // At least 4 pixels? + lsls r2, #29 + bcc 1f + ldmia r1!, {r3} + storew_alignh r3, r0, 0 + ldmia r1!, {r3} + storew_alignh r3, r0, 4 + adds r0, #8 +1: + // At least 2 pixels? + lsls r2, #1 + bcc 1f + ldmia r1!, {r3} + storew_alignh r3, r0, 0 + adds r0, #4 +1: + // One more pixel? + lsls r2, #1 + bcc 1f + ldrh r3, [r1] + strh r3, [r0] +1: bx lr .macro sprite_blit16_alpha_body n