/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

// Functions for doing simple 2D graphics operations on a RGB scanline buffer.
//
// All routines are hand-counted Cortex-M0+ (ARMv6-M) Thumb code. They rely
// heavily on "computed branch into a slide": the entry point into a run of
// unrolled, identical-size instruction groups is computed from the pixel
// count, so the correct number of pixels is processed with no per-pixel
// loop overhead. Because the branch arithmetic depends on the exact byte
// size of each unrolled group, do not reorder or substitute instructions
// here without re-checking every adr/subs/bx sequence.

#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"

// Interpolator register offsets relative to SIO_INTERP0_ACCUM0, so they can
// be used as immediate load offsets from a base pointer held in a register.
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) // (not referenced in this file)

.syntax unified
.cpu cortex-m0plus
.thumb

// Put every function in its own ELF section, to permit linker GC
.macro decl_func name
.section .time_critical.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm

// ----------------------------------------------------------------------------
// Colour fill

// r0: dst
// r1: value
// r2: count
//
// Fill r2 bytes at r0 with the byte value in r1. Clobbers r1-r3, ip, flags.
decl_func sprite_fill8
    // Slide for short fills: jump into the strb run so that exactly r2 of the
    // 18 stores below execute (each strb is 2 bytes, hence the lsls #1).
    cmp r2, #18
    bhi 2f
    adr r3, 1f                         // adr needs 1f word-aligned (see .align 2)
    lsls r2, #1
    subs r3, r2
    adds r3, #1                        // thumb bit
    bx r3
.align 2
    strb r1, [r0, #17]
    strb r1, [r0, #16]
    strb r1, [r0, #15]
    strb r1, [r0, #14]
    strb r1, [r0, #13]
    strb r1, [r0, #12]
    strb r1, [r0, #11]
    strb r1, [r0, #10]
    strb r1, [r0, #9]
    strb r1, [r0, #8]
    strb r1, [r0, #7]
    strb r1, [r0, #6]
    strb r1, [r0, #5]
    strb r1, [r0, #4]
    strb r1, [r0, #3]
    strb r1, [r0, #2]
    strb r1, [r0, #1]
    strb r1, [r0, #0]
1:
    bx lr
2:
    // Long fill: replicate the byte across all 4 lanes of r1.
    lsls r3, r1, #8
    orrs r1, r3
    lsls r3, r1, #16
    orrs r1, r3
    // Get r0 word-aligned: lsrs shifts bit 0 (then bit 1) of r0 into carry.
    lsrs r3, r0, #1
    bcc 1f
    strb r1, [r0]
    adds r0, #1
    subs r2, #1
1:
    lsrs r3, r0, #2
    bcc 1f
    strh r1, [r0]
    adds r0, #2
    subs r2, #2
1:
    // Set up for main loop. Limit pointer at end - (loop body size - 1)
    // (body stores 16 bytes, so loop continues while >= 16 bytes remain).
    push {r4}
    adds r2, r0
    subs r2, #15
    mov ip, r2
    mov r2, r1
    mov r3, r1
    mov r4, r1
    // Fall straight into loop, because cases less than (loop body + max
    // misalignment) are handled by slide
1:
    stmia r0!, {r1, r2, r3, r4}
    cmp r0, ip
    blo 1b
    // Main loop done, now tidy up the odds and ends: r4 = bytes remaining
    // (= limit + 15 - r0), at most 15.
    mov r4, ip
    subs r4, r0
    adds r4, #15
    // No more than 15 bytes remaining -- first test bit 3 (lsls #29 puts
    // bit 3 in carry; each following lsls #1 tests the next lower bit).
    lsls r4, #29
    bcc 1f
    stmia r0!, {r1, r2}
1:
    lsls r4, #1
    bcc 1f
    stmia r0!, {r1}
1:
    lsls r4, #1
    bcc 1f
    strh r1, [r0]
    adds r0, #2
1:
    lsls r4, #1
    bcc 1f
    strb r1, [r0]
1:
    pop {r4}
    bx lr

// r0: dst (halfword-aligned)
// r1: value (16-bit)
// r2: count (halfwords)
//
// Fill r2 halfwords at r0 with the value in r1. Clobbers r1-r3, ip, flags.
decl_func sprite_fill16
    // Slide for short fills: each strh is 2 bytes, execute the last r2 of them.
    cmp r2, #15
    bhi 2f
    adr r3, 1f
    lsls r2, #1
    subs r3, r2
    adds r3, #1                        // thumb bit
    bx r3
.align 2
    strh r1, [r0, #30]
    strh r1, [r0, #28]
    strh r1, [r0, #26]
    strh r1, [r0, #24]
    strh r1, [r0, #22]
    strh r1, [r0, #20]
    strh r1, [r0, #18]
    strh r1, [r0, #16]
    strh r1, [r0, #14]
    strh r1, [r0, #12]
    strh r1, [r0, #10]
    strh r1, [r0, #8]
    strh r1, [r0, #6]
    strh r1, [r0, #4]
    strh r1, [r0, #2]
    strh r1, [r0, #0]
1:
    bx lr
2:
    push {r4, r5, r6, r7, lr}
    // Get word-aligned before main fill loop
    // NOTE(review): this shifts bit 1 of the *count* (r2) into carry, but the
    // comment above (and the matching code in sprite_fill8) suggests the
    // *pointer* alignment test `lsrs r3, r0, #2` was intended -- verify
    // against upstream before relying on unaligned-dst behaviour.
    lsrs r3, r2, #2
    bcc 1f
    strh r1, [r0]
    adds r0, #2
    subs r2, #1
1:
    // Set limit pointer at end - (loop body size - 1); body is 28 bytes.
    lsls r2, #1
    adds r2, r0
    subs r2, #26
    mov ip, r2
    // Replicate halfword into both halves of r1, then into r2-r7.
    lsls r2, r1, #16
    orrs r1, r2
    mov r2, r1
    mov r3, r1
    mov r4, r1
    mov r5, r1
    mov r6, r1
    mov r7, r1
    // We can fall through because cases < 1 loop are handled by slide
1:
    stmia r0!, {r1, r2, r3, r4, r5, r6, r7} // wheeeeeeeeeee
    cmp r0, ip
    blo 1b
    // Most of the work done, we have a few more to tidy up:
    // r2 = bytes remaining (= limit + 26 - r0), then binary-test bits
    // 4, 3, 2, 1 via shift-into-carry.
    movs r2, #26
    add r2, ip
    subs r2, r0
    lsls r2, #28
    bcc 1f
    stmia r0!, {r4, r5, r6, r7}
1:
    lsls r2, #1
    bcc 1f
    stmia r0!, {r4, r5}
1:
    lsls r2, #1
    bcc 1f
    stmia r0!, {r4}
1:
    lsls r2, #1
    bcc 1f
    strh r4, [r0]
1:
    pop {r4, r5, r6, r7, pc}

// ----------------------------------------------------------------------------
// Non-AT sprite

// r0: dst
// r1: src
// r2: pixel count
//
// Unrolled loop body with an initial computed branch. Note we can go much
// faster if r0 and r1 are co-aligned, but it's not all that helpful to have a
// 1 in 4 chance of being really fast when minimising worst-case scanline time
//
// Strategy: advance r0/r1 to the end of the (count & ~7) region, enter the
// slide to copy the count%8 remainder, then run full 8-pixel iterations
// backwards toward ip (the original dst). Clobbers r1-r3, ip, flags.
decl_func sprite_blit8
    mov ip, r0
    lsrs r3, r2, #3
    lsls r3, #3
    eors r2, r3                        // r2 = pixels % 8, r3 = pixels - pixels % 8
    add r0, r3
    add r1, r3
    adr r3, 2f
    lsls r2, #2                        // each ldrb/strb pair is 4 bytes
    subs r3, r2
    adds r3, #1                        // thumb bit >:(
    bx r3
.align 2
1:
    subs r0, #8
    subs r1, #8
    ldrb r3, [r1, #7]
    strb r3, [r0, #7]
    ldrb r3, [r1, #6]
    strb r3, [r0, #6]
    ldrb r3, [r1, #5]
    strb r3, [r0, #5]
    ldrb r3, [r1, #4]
    strb r3, [r0, #4]
    ldrb r3, [r1, #3]
    strb r3, [r0, #3]
    ldrb r3, [r1, #2]
    strb r3, [r0, #2]
    ldrb r3, [r1, #1]
    strb r3, [r0, #1]
    ldrb r3, [r1, #0]
    strb r3, [r0, #0]
2:
    cmp r0, ip
    bhi 1b
    bx lr

// Assume RAGB2132 (so alpha is bit 5)
#define ALPHA_SHIFT_8BPP 6

// Copy one 8bpp pixel at offset \n, skipping it when the alpha bit is clear.
// lsrs by ALPHA_SHIFT_8BPP moves bit 5 (alpha) into the carry flag.
// 8 bytes of code per expansion (matches the lsls #3 below).
.macro sprite_blit8_alpha_body n
    ldrb r3, [r1, #\n]
    lsrs r2, r3, #ALPHA_SHIFT_8BPP
    bcc 2f
    strb r3, [r0, #\n]
2:
.endm

// Same contract as sprite_blit8, but transparent (alpha=0) pixels are skipped.
decl_func sprite_blit8_alpha
    mov ip, r0
    lsrs r3, r2, #3
    lsls r3, #3
    eors r2, r3                        // r2 = pixels % 8, r3 = pixels - pixels % 8
    add r0, r3
    add r1, r3
    adr r3, 3f
    lsls r2, #3                        // each alpha body is 8 bytes
    subs r3, r2
    adds r3, #1                        // thumb bit
    bx r3
.align 2
1:
    subs r0, #8
    subs r1, #8
    sprite_blit8_alpha_body 7
    sprite_blit8_alpha_body 6
    sprite_blit8_alpha_body 5
    sprite_blit8_alpha_body 4
    sprite_blit8_alpha_body 3
    sprite_blit8_alpha_body 2
    sprite_blit8_alpha_body 1
    sprite_blit8_alpha_body 0
3:
    cmp r0, ip
    bhi 1b
    bx lr

// Same contract as sprite_blit8, for 16bpp pixels (r2 is still a pixel count).
decl_func sprite_blit16
    mov ip, r0
    lsrs r3, r2, #3
    lsls r3, #3
    eors r2, r3                        // r2 = pixels % 8, r3 = pixels - pixels % 8
    lsls r3, #1                        // byte offset = 2 bytes per pixel
    add r0, r3
    add r1, r3
    adr r3, 2f
    lsls r2, #2                        // each ldrh/strh pair is 4 bytes
    subs r3, r2
    adds r3, #1                        // thumb bit >:(
    bx r3
.align 2
1:
    subs r0, #16
    subs r1, #16
    ldrh r3, [r1, #14]
    strh r3, [r0, #14]
    ldrh r3, [r1, #12]
    strh r3, [r0, #12]
    ldrh r3, [r1, #10]
    strh r3, [r0, #10]
    ldrh r3, [r1, #8]
    strh r3, [r0, #8]
    ldrh r3, [r1, #6]
    strh r3, [r0, #6]
    ldrh r3, [r1, #4]
    strh r3, [r0, #4]
    ldrh r3, [r1, #2]
    strh r3, [r0, #2]
    ldrh r3, [r1, #0]
    strh r3, [r0, #0]
2:
    cmp r0, ip
    bhi 1b
    bx lr

// Assume RGAB5515 (so alpha is bit 5)
// Note the alpha bit being in the same position as RAGB2132 is a coincidence.
// We are just stealing an LSB such that we can treat our alpha pixels in the
// same way as non-alpha pixels when encoding (and the co-opted channel LSB
// always ends up being set on alpha pixels, which is pretty harmless)
#define ALPHA_SHIFT_16BPP 6

// Copy one 16bpp pixel at halfword offset \n, skipped when alpha (bit 5) is
// clear. 8 bytes of code per expansion.
.macro sprite_blit16_alpha_body n
    ldrh r3, [r1, #2*\n]
    lsrs r2, r3, #ALPHA_SHIFT_16BPP
    bcc 2f
    strh r3, [r0, #2*\n]
2:
.endm

// Same contract as sprite_blit16, but transparent pixels are skipped.
decl_func sprite_blit16_alpha
    mov ip, r0
    lsrs r3, r2, #3
    lsls r3, #3
    eors r2, r3                        // r2 = pixels % 8
    lsls r3, #1                        // 2 bytes per pixel
    add r0, r3
    add r1, r3
    adr r3, 3f
    lsls r2, #3                        // each alpha body is 8 bytes
    subs r3, r2
    adds r3, #1                        // thumb bit
    bx r3
.align 2
1:
    subs r0, #16
    subs r1, #16
    sprite_blit16_alpha_body 7
    sprite_blit16_alpha_body 6
    sprite_blit16_alpha_body 5
    sprite_blit16_alpha_body 4
    sprite_blit16_alpha_body 3
    sprite_blit16_alpha_body 2
    sprite_blit16_alpha_body 1
    sprite_blit16_alpha_body 0
3:
    cmp r0, ip
    bne 1b
    bx lr

// ----------------------------------------------------------------------------
// Affine-transformed sprite (note these are just the inner loops -- INTERP0
// must be configured by the caller, which is presumably not written in asm)

// r0: raster start pointer
// r1: raster span size (pixels)

// One affine-sampled 8bpp pixel: read CTRL_LANE0 (for the OVERF flag), pop
// the interpolated source address from POP_FULL, and store the sampled byte
// unless OVERF is set (the lsrs shifts OVERF into carry; bcs skips the
// pixel). 12 bytes of code per expansion.
.macro sprite_ablit8_loop_body n
    ldr r1, [r3, #CTRL0_OFFS]
    ldr r2, [r3, #POP2_OFFS]
    lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
    bcs 2f
    ldrb r2, [r2]
    strb r2, [r0, #\n]
2:
.endm

// Clobbers r1-r3, ip, flags. r3 is loaded with the INTERP0 base address.
decl_func sprite_ablit8_loop
    mov ip, r0
    lsrs r2, r1, #3
    lsls r2, #3
    eors r1, r2                        // r1 = pixels % 8, r2 = pixels - pixels % 8
    add r0, r2
    adr r2, 3f
    movs r3, #12                       // Each (non-unrolled) loop body is 12 bytes
    muls r1, r3
    subs r2, r1
    adds r2, #1                        // thumb bit
    ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    bx r2
.align 2
    nop                                // padding: keeps label 3 word-aligned for adr
1:
    subs r0, #8
    sprite_ablit8_loop_body 7
    sprite_ablit8_loop_body 6
    sprite_ablit8_loop_body 5
    sprite_ablit8_loop_body 4
    sprite_ablit8_loop_body 3
    sprite_ablit8_loop_body 2
    sprite_ablit8_loop_body 1
    sprite_ablit8_loop_body 0
3:
    cmp r0, ip
    bne 1b
    bx lr

// As above but bit 5 is assumed to be an alpha bit (RAGB2132)

// As sprite_ablit8_loop_body, with an extra alpha test on the sampled pixel
// (second lsrs shifts bit 5 into carry). 16 bytes of code per expansion.
.macro sprite_ablit8_alpha_loop_body n
    ldr r1, [r3, #CTRL0_OFFS]
    ldr r2, [r3, #POP2_OFFS]
    lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
    bcs 2f
    ldrb r2, [r2]
    lsrs r1, r2, #ALPHA_SHIFT_8BPP
    bcc 2f
    strb r2, [r0, #\n]
2:
.endm

decl_func sprite_ablit8_alpha_loop
    mov ip, r0
    ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    lsrs r2, r1, #3
    lsls r2, #3
    eors r1, r2                        // r1 = pixels % 8
    add r0, r2
    adr r2, 3f
    lsls r1, #4                        // Each (non-unrolled) loop body is 16 bytes
    subs r2, r1
    adds r2, #1                        // thumb bit
    bx r2
.align 2
    nop                                // padding: keeps label 3 word-aligned for adr
1:
    subs r0, #8
    sprite_ablit8_alpha_loop_body 7
    sprite_ablit8_alpha_loop_body 6
    sprite_ablit8_alpha_loop_body 5
    sprite_ablit8_alpha_loop_body 4
    sprite_ablit8_alpha_loop_body 3
    sprite_ablit8_alpha_loop_body 2
    sprite_ablit8_alpha_loop_body 1
    sprite_ablit8_alpha_loop_body 0
3:
    cmp r0, ip
    bhi 1b
    bx lr

// 16bpp variant of sprite_ablit8_loop_body. 12 bytes of code per expansion.
.macro sprite_ablit16_loop_body n
    ldr r1, [r3, #CTRL0_OFFS]
    ldr r2, [r3, #POP2_OFFS]
    lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
    bcs 2f
    ldrh r2, [r2]
    strh r2, [r0, #2*\n]
2:
.endm

decl_func sprite_ablit16_loop
    mov ip, r0
    lsrs r2, r1, #3
    lsls r2, #3
    eors r1, r2                        // r1 = pixels % 8
    lsls r2, #1                        // Each pixel is 2 bytes
    add r0, r2
    adr r2, 3f
    movs r3, #12                       // Each (non-unrolled) loop body is 12 bytes
    muls r1, r3
    subs r2, r1
    adds r2, #1                        // thumb bit
    ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    bx r2
.align 2
    nop                                // padding: keeps label 3 word-aligned for adr
1:
    subs r0, #16
    sprite_ablit16_loop_body 7
    sprite_ablit16_loop_body 6
    sprite_ablit16_loop_body 5
    sprite_ablit16_loop_body 4
    sprite_ablit16_loop_body 3
    sprite_ablit16_loop_body 2
    sprite_ablit16_loop_body 1
    sprite_ablit16_loop_body 0
3:
    cmp r0, ip
    bne 1b
    bx lr

// 16bpp variant with alpha test (RGAB5515, alpha in bit 5). 16 bytes of code
// per expansion.
.macro sprite_ablit16_alpha_loop_body n
    ldr r1, [r3, #CTRL0_OFFS]
    ldr r2, [r3, #POP2_OFFS]
    lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
    bcs 2f
    ldrh r2, [r2]
    lsrs r1, r2, #ALPHA_SHIFT_16BPP
    bcc 2f
    strh r2, [r0, #2*\n]
2:
.endm

decl_func sprite_ablit16_alpha_loop
    mov ip, r0
    ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    lsrs r2, r1, #3
    lsls r2, #3
    eors r1, r2                        // r1 = pixels % 8
    lsls r2, #1                        // Each pixel is 2 bytes
    add r0, r2
    adr r2, 3f
    lsls r1, #4                        // Each (non-unrolled) loop body is 16 bytes
    subs r2, r1
    adds r2, #1                        // thumb bit
    bx r2
.align 2
    nop                                // padding: keeps label 3 word-aligned for adr
1:
    subs r0, #16
    sprite_ablit16_alpha_loop_body 7
    sprite_ablit16_alpha_loop_body 6
    sprite_ablit16_alpha_loop_body 5
    sprite_ablit16_alpha_loop_body 4
    sprite_ablit16_alpha_loop_body 3
    sprite_ablit16_alpha_loop_body 2
    sprite_ablit16_alpha_loop_body 1
    sprite_ablit16_alpha_loop_body 0
3:
    cmp r0, ip
    bhi 1b
    bx lr