// pico-playground/scanvideo/sprite/sprite.S

/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
// Functions for doing simple 2D graphics operations on an RGB scanline buffer.
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
.syntax unified
.cpu cortex-m0plus
.thumb
// Put every function in its own ELF section, to permit linker GC
.macro decl_func name
.section .time_critical.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// ----------------------------------------------------------------------------
// Colour fill
// r0: dst
// r1: value
// r2: count
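// Strategy: counts of 18 or fewer take a computed branch into a ladder of byte
// stores; longer fills word-align dst, broadcast the value across a word
// (value * 0x01010101) and write it out in 16-byte stmia bursts. Rough C
// equivalent (illustrative only): memset(dst, value, count).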
decl_func sprite_fill8
// Slide for short fills
cmp r2, #18
bhi 2f
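// Each strb below is 2 bytes, so branching to (1f - 2*count) executes exactly
// count byte stores before falling through to the bx lr at 1: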
adr r3, 1f
lsls r2, #1
subs r3, r2
adds r3, #1 // thumb bit
bx r3
.align 2
strb r1, [r0, #17]
strb r1, [r0, #16]
strb r1, [r0, #15]
strb r1, [r0, #14]
strb r1, [r0, #13]
strb r1, [r0, #12]
strb r1, [r0, #11]
strb r1, [r0, #10]
strb r1, [r0, #9]
strb r1, [r0, #8]
strb r1, [r0, #7]
strb r1, [r0, #6]
strb r1, [r0, #5]
strb r1, [r0, #4]
strb r1, [r0, #3]
strb r1, [r0, #2]
strb r1, [r0, #1]
strb r1, [r0, #0]
1:
bx lr
2:
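// Broadcast the byte value into all four bytes of a word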
lsls r3, r1, #8
orrs r1, r3
lsls r3, r1, #16
orrs r1, r3
// Get r0 word-aligned:
lsrs r3, r0, #1
bcc 1f
strb r1, [r0]
adds r0, #1
subs r2, #1
1:
lsrs r3, r0, #2
bcc 1f
strh r1, [r0]
adds r0, #2
subs r2, #2
1:
// Set up for main loop. Limit pointer at end - (loop body size - 1)
push {r4}
adds r2, r0
subs r2, #15
mov ip, r2
mov r2, r1
mov r3, r1
mov r4, r1
// Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
1:
stmia r0!, {r1, r2, r3, r4}
cmp r0, ip
blo 1b
// Main loop done, now tidy up the odds and ends
mov r4, ip
subs r4, r0
adds r4, #15
// No more than 15 bytes remaining -- first test bit 3
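// (lsls #29 moves bit 3 into the carry flag; each subsequent lsls #1 tests the
// next lower bit)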
lsls r4, #29
bcc 1f
stmia r0!, {r1, r2}
1:
lsls r4, #1
bcc 1f
stmia r0!, {r1}
1:
lsls r4, #1
bcc 1f
strh r1, [r0]
adds r0, #2
1:
lsls r4, #1
bcc 1f
strb r1, [r0]
1:
pop {r4}
bx lr
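// r0: dst
// r1: value (16-bit pixel)
// r2: count (pixels)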
decl_func sprite_fill16
// Slide for short fills
cmp r2, #15
bhi 2f
adr r3, 1f
lsls r2, #1
subs r3, r2
adds r3, #1
bx r3
.align 2
strh r1, [r0, #30]
strh r1, [r0, #28]
strh r1, [r0, #26]
strh r1, [r0, #24]
strh r1, [r0, #22]
strh r1, [r0, #20]
strh r1, [r0, #18]
strh r1, [r0, #16]
strh r1, [r0, #14]
strh r1, [r0, #12]
strh r1, [r0, #10]
strh r1, [r0, #8]
strh r1, [r0, #6]
strh r1, [r0, #4]
strh r1, [r0, #2]
strh r1, [r0, #0]
1:
bx lr
2:
push {r4, r5, r6, r7, lr}
// Get word-aligned before main fill loop
lsrs r3, r0, #2 // Carry = bit 1 of dst: if set, dst is not yet word-aligned
bcc 1f
strh r1, [r0]
adds r0, #2
subs r2, #1
1:
// Set limit pointer at end - (loop body size - 1)
lsls r2, #1
adds r2, r0
subs r2, #26
mov ip, r2
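// Broadcast the 16-bit value into both halves of a word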
lsls r2, r1, #16
orrs r1, r2
mov r2, r1
mov r3, r1
mov r4, r1
mov r5, r1
mov r6, r1
mov r7, r1
// We can fall through because cases < 1 loop are handled by slide
1:
stmia r0!, {r1, r2, r3, r4, r5, r6, r7} // wheeeeeeeeeee
cmp r0, ip
blo 1b
// Main loop done; at most 26 bytes remain to tidy up
movs r2, #26
add r2, ip
subs r2, r0
lsls r2, #28
bcc 1f
stmia r0!, {r4, r5, r6, r7}
1:
lsls r2, #1
bcc 1f
stmia r0!, {r4, r5}
1:
lsls r2, #1
bcc 1f
stmia r0!, {r4}
1:
lsls r2, #1
bcc 1f
strh r4, [r0]
1:
pop {r4, r5, r6, r7, pc}
// ----------------------------------------------------------------------------
// Non-AT sprite
// r0: dst
// r1: src
// r2: pixel count
//
// Unrolled loop body with an initial computed branch. Note we can go much
// faster if r0 and r1 are co-aligned, but it's not all that helpful to have a
// 1 in 4 chance of being really fast when minimising worst-case scanline time
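//
// Strategy: advance both pointers by (count & ~7), take a computed branch into
// the ladder below to copy the trailing count % 8 pixels, then walk backwards
// copying 8 pixels per iteration until r0 is back at its saved start (ip).
// Rough C equivalent (illustrative only): memcpy(dst, src, count) for byte pixels.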
decl_func sprite_blit8
mov ip, r0
lsrs r3, r2, #3
lsls r3, #3
eors r2, r3 // r2 = pixels % 8, r3 = pixels - pixels % 8
add r0, r3
add r1, r3
adr r3, 2f
lsls r2, #2
subs r3, r2
adds r3, #1 // thumb bit >:(
bx r3
.align 2
1:
subs r0, #8
subs r1, #8
ldrb r3, [r1, #7]
strb r3, [r0, #7]
ldrb r3, [r1, #6]
strb r3, [r0, #6]
ldrb r3, [r1, #5]
strb r3, [r0, #5]
ldrb r3, [r1, #4]
strb r3, [r0, #4]
ldrb r3, [r1, #3]
strb r3, [r0, #3]
ldrb r3, [r1, #2]
strb r3, [r0, #2]
ldrb r3, [r1, #1]
strb r3, [r0, #1]
ldrb r3, [r1, #0]
strb r3, [r0, #0]
2:
cmp r0, ip
bhi 1b
bx lr
// Assume RAGB2132 (so alpha is bit 5)
#define ALPHA_SHIFT_8BPP 6
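// The lsrs in the body below moves bit 5 of the source pixel into the carry
// flag; in C terms (illustrative): if (src[i] & 0x20) dst[i] = src[i];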
.macro sprite_blit8_alpha_body n
ldrb r3, [r1, #\n]
lsrs r2, r3, #ALPHA_SHIFT_8BPP
bcc 2f
strb r3, [r0, #\n]
2:
.endm
decl_func sprite_blit8_alpha
mov ip, r0
lsrs r3, r2, #3
lsls r3, #3
eors r2, r3
add r0, r3
add r1, r3
adr r3, 3f
lsls r2, #3
subs r3, r2
adds r3, #1
bx r3
.align 2
1:
subs r0, #8
subs r1, #8
sprite_blit8_alpha_body 7
sprite_blit8_alpha_body 6
sprite_blit8_alpha_body 5
sprite_blit8_alpha_body 4
sprite_blit8_alpha_body 3
sprite_blit8_alpha_body 2
sprite_blit8_alpha_body 1
sprite_blit8_alpha_body 0
3:
cmp r0, ip
bhi 1b
bx lr
decl_func sprite_blit16
mov ip, r0
lsrs r3, r2, #3
lsls r3, #3
eors r2, r3 // r2 = pixels % 8, r3 = pixels - pixels % 8
lsls r3, #1
add r0, r3
add r1, r3
adr r3, 2f
lsls r2, #2
subs r3, r2
adds r3, #1 // thumb bit >:(
bx r3
.align 2
1:
subs r0, #16
subs r1, #16
ldrh r3, [r1, #14]
strh r3, [r0, #14]
ldrh r3, [r1, #12]
strh r3, [r0, #12]
ldrh r3, [r1, #10]
strh r3, [r0, #10]
ldrh r3, [r1, #8]
strh r3, [r0, #8]
ldrh r3, [r1, #6]
strh r3, [r0, #6]
ldrh r3, [r1, #4]
strh r3, [r0, #4]
ldrh r3, [r1, #2]
strh r3, [r0, #2]
ldrh r3, [r1, #0]
strh r3, [r0, #0]
2:
cmp r0, ip
bhi 1b
bx lr
// Assume RGAB5515 (so alpha is bit 5)
// Note the alpha bit being in the same position as RAGB2132 is a coincidence.
// We are just stealing an LSB such that we can treat our alpha pixels in the
// same way as non-alpha pixels when encoding (and the co-opted channel LSB
// always ends up being set on alpha pixels, which is pretty harmless)
#define ALPHA_SHIFT_16BPP 6
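// As above, the lsrs moves bit 5 into carry; in C terms (illustrative):
// if (src[i] & 0x20) dst[i] = src[i];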
.macro sprite_blit16_alpha_body n
ldrh r3, [r1, #2*\n]
lsrs r2, r3, #ALPHA_SHIFT_16BPP
bcc 2f
strh r3, [r0, #2*\n]
2:
.endm
decl_func sprite_blit16_alpha
mov ip, r0
lsrs r3, r2, #3
lsls r3, #3
eors r2, r3
lsls r3, #1
add r0, r3
add r1, r3
adr r3, 3f
lsls r2, #3
subs r3, r2
adds r3, #1
bx r3
.align 2
1:
subs r0, #16
subs r1, #16
sprite_blit16_alpha_body 7
sprite_blit16_alpha_body 6
sprite_blit16_alpha_body 5
sprite_blit16_alpha_body 4
sprite_blit16_alpha_body 3
sprite_blit16_alpha_body 2
sprite_blit16_alpha_body 1
sprite_blit16_alpha_body 0
3:
cmp r0, ip
bhi 1b
bx lr
// ----------------------------------------------------------------------------
// Affine-transformed sprite (note these are just the inner loops -- INTERP0
// must be configured by the caller, which is presumably not written in asm)
// r0: raster start pointer
// r1: raster span size (pixels)
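//
// The interpolator is used as a texture-coordinate stepper: ACCUM0/1 hold
// fixed-point u and v, BASE0/1 hold the per-pixel increments, and BASE2 holds
// the address of the sprite's pixel data, so each POP_FULL read yields a texel
// address and steps u and v on to the next screen pixel. The lane shifts/masks
// extract the integer texel coordinates, and the CTRL_LANE0 OVERF flag marks
// pixels whose coordinates fall outside the sprite, which the loops below skip.
// A rough caller-side sketch using the pico-sdk hardware_interp API
// (illustrative only: FRAC_BITS, LOG2_W, du/dv, u0/v0 and sprite_pixels are
// placeholders, and the exact shift/mask values depend on the sprite size and
// fixed-point format -- see the RP2040 datasheet's interpolator texture-mapping
// example for a worked setup):
//
//   #include "hardware/interp.h"
//   interp_config c = interp_default_config();
//   interp_config_set_add_raw(&c, true);       // accumulators step at full fixed-point precision
//   interp_config_set_shift(&c, FRAC_BITS);    // drop u's fraction bits for the address
//   interp_config_set_mask(&c, 0, LOG2_W - 1); // keep only the integer texel x
//   interp_set_config(interp0, 0, &c);
//   // lane 1 is configured similarly for v, shifted/masked so it indexes whole rows
//   interp0->base[0] = du;   interp0->base[1] = dv;
//   interp0->accum[0] = u0;  interp0->accum[1] = v0;
//   interp0->base[2] = (uintptr_t) sprite_pixels;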
.macro sprite_ablit8_loop_body n
ldr r1, [r3, #CTRL0_OFFS] // Read CTRL_LANE0 for this pixel's OVERF flag
ldr r2, [r3, #POP2_OFFS] // POP_FULL: r2 = texel address; accumulators step to the next pixel
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1 // OVERF -> carry
bcs 2f // Skip pixels which fall outside the sprite
ldrb r2, [r2]
strb r2, [r0, #\n]
2:
.endm
decl_func sprite_ablit8_loop
mov ip, r0
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
add r0, r2
adr r2, 3f
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
muls r1, r3
subs r2, r1
adds r2, #1
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
bx r2
.align 2
nop
1:
subs r0, #8
sprite_ablit8_loop_body 7
sprite_ablit8_loop_body 6
sprite_ablit8_loop_body 5
sprite_ablit8_loop_body 4
sprite_ablit8_loop_body 3
sprite_ablit8_loop_body 2
sprite_ablit8_loop_body 1
sprite_ablit8_loop_body 0
3:
cmp r0, ip
bne 1b
bx lr
// As above but bit 5 is assumed to be an alpha bit (RAGB2132)
.macro sprite_ablit8_alpha_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrb r2, [r2]
lsrs r1, r2, #ALPHA_SHIFT_8BPP
bcc 2f
strb r2, [r0, #\n]
2:
.endm
decl_func sprite_ablit8_alpha_loop
mov ip, r0
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
add r0, r2
adr r2, 3f
lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
subs r2, r1
adds r2, #1
bx r2
.align 2
nop
1:
subs r0, #8
sprite_ablit8_alpha_loop_body 7
sprite_ablit8_alpha_loop_body 6
sprite_ablit8_alpha_loop_body 5
sprite_ablit8_alpha_loop_body 4
sprite_ablit8_alpha_loop_body 3
sprite_ablit8_alpha_loop_body 2
sprite_ablit8_alpha_loop_body 1
sprite_ablit8_alpha_loop_body 0
3:
cmp r0, ip
bhi 1b
bx lr
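// 16 bpp versions of the loops above: the caller's INTERP0 setup must produce
// halfword (2-byte aligned) texel addresses, and the raster pointer advances
// 2 bytes per pixel.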
.macro sprite_ablit16_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrh r2, [r2]
strh r2, [r0, #2*\n]
2:
.endm
decl_func sprite_ablit16_loop
mov ip, r0
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
lsls r2, #1 // Each pixel is 2 bytes
add r0, r2
adr r2, 3f
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
muls r1, r3
subs r2, r1
adds r2, #1
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
bx r2
.align 2
nop
1:
subs r0, #16
sprite_ablit16_loop_body 7
sprite_ablit16_loop_body 6
sprite_ablit16_loop_body 5
sprite_ablit16_loop_body 4
sprite_ablit16_loop_body 3
sprite_ablit16_loop_body 2
sprite_ablit16_loop_body 1
sprite_ablit16_loop_body 0
3:
cmp r0, ip
bne 1b
bx lr
.macro sprite_ablit16_alpha_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrh r2, [r2]
lsrs r1, r2, #ALPHA_SHIFT_16BPP
bcc 2f
strh r2, [r0, #2*\n]
2:
.endm
decl_func sprite_ablit16_alpha_loop
mov ip, r0
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
lsls r2, #1 // Each pixel is 2 bytes
add r0, r2
adr r2, 3f
lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
subs r2, r1
adds r2, #1
bx r2
.align 2
nop
1:
subs r0, #16
sprite_ablit16_alpha_loop_body 7
sprite_ablit16_alpha_loop_body 6
sprite_ablit16_alpha_loop_body 5
sprite_ablit16_alpha_loop_body 4
sprite_ablit16_alpha_loop_body 3
sprite_ablit16_alpha_loop_body 2
sprite_ablit16_alpha_loop_body 1
sprite_ablit16_alpha_loop_body 0
3:
cmp r0, ip
bhi 1b
bx lr