RP2350 changes (including RISC-V)

pull/73/head
Luke Wren 2024-08-10 13:29:14 -07:00
rodzic 8ecccce7b7
commit ca941baf37
28 zmienionych plików z 1711 dodań i 175 usunięć

Wyświetl plik

@ -1,3 +1,30 @@
RP2350 PicoDVI Preview
======================
Changes from the public GitHub version:
* All Arm assembly in `libdvi` has been ported to RISC-V and tuned for Hazard3
* Some of the existing Arm assembly in `libdvi` has been tweaked for better performance on Cortex-M33
* RGB encode now uses the SIO TMDS encoders by default on RP2350 (can be disabled by defining `DVI_USE_SIO_TMDS_ENCODER=0` -- see `software/libdvi/dvi_config_defs.h`)
* Much of the Arm assembly in `libsprite` has been ported to RISC-V -- enough to run the stock demos
Build instructions:
```bash
cd software
mkdir build
# PICO_PLATFORM can also be rp2350-riscv
# List of DVI configs is in software/include/common_dvi_pin_configs.h
cmake -DPICO_SDK_PATH=/path/to/sdk -DPICO_PLATFORM=rp2350 -DPICO_COPY_TO_RAM=1 -DDVI_DEFAULT_SERIAL_CONFIG=pico_sock_cfg ..
make -j$(nproc)
# Then flash a binary, e.g.:
cp apps/tiles_and_sprites/tiles_and_sprites.uf2 /media/${USER}/RP2350
```
If you plan to run the `vista` demo, then note that there are now two UF2 data files, `software/assets/vista_data_rp2040.uf2` and `software/assets/vista_data_rp2350.uf2`. The only difference is the family IDs: the first can be dragged on RP2040 and on RP2350 A0, and the second can be dragged on RP2350 A1 and later.
The following is the original RP2040 writeup:
Bitbanged DVI on the RP2040 Microcontroller
===========================================

1
software/.gitignore vendored
Wyświetl plik

@ -1,2 +1,3 @@
build
*.swp
build-*

Wyświetl plik

@ -1,4 +1,7 @@
add_subdirectory(bad_apple)
if (NOT PICO_RISCV)
# Arm assembly needs porting to RISC-V
add_subdirectory(bad_apple)
endif()
add_subdirectory(colour_terminal)
add_subdirectory(christmas_snowflakes)
add_subdirectory(dht_logging)
@ -12,5 +15,8 @@ add_subdirectory(tiles)
add_subdirectory(tiles_and_sprites)
add_subdirectory(tiles_parallax)
add_subdirectory(vista)
add_subdirectory(vista-palette)
if (PICO_RP2040)
# Needs porting to use XIP stream instead of SSI, as was done to vista
add_subdirectory(vista-palette)
endif()
add_subdirectory(mandel-full)

Wyświetl plik

@ -8,7 +8,6 @@
#include "hardware/gpio.h"
#include "hardware/vreg.h"
#include "hardware/structs/bus_ctrl.h"
#include "hardware/structs/ssi.h"
#include "hardware/dma.h"
#include "pico/sem.h"

Wyświetl plik

@ -1,9 +1,11 @@
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#ifndef __riscv
.syntax unified
.cpu cortex-m0plus
.thumb
#endif
// Using the following:
//
@ -46,12 +48,13 @@
// r8 contains a pointer to the font bitmap for this scanline.
// r9 contains the TMDS LUT base.
.macro do_char charbuf_offs colour_shift_instr colour_shamt
#ifndef __riscv
// Get 8x font bits for next character, put 4 LSBs in bits 6:3 of r4 (so
// scaled to 8-byte LUT entries), and 4 MSBs in bits 6:3 of r6.
ldrb r4, [r0, #\charbuf_offs] // 2
add r4, r8 // 1
ldrb r4, [r4] // 2
lsrs r6, r4, #4 // 1
ldrb r4, [r0, #\charbuf_offs] // 2 (note these cycle
add r4, r8 // 1 counts are for M0+
ldrb r4, [r4] // 2 and are a little
lsrs r6, r4, #4 // 1 pessimistic on M33)
lsls r6, #3 // 1
lsls r4, #28 // 1
lsrs r4, #25 // 1
@ -67,6 +70,31 @@
ldmia r4, {r4, r5} // 3
ldmia r6, {r6, r7} // 3
stmia r2!, {r4-r7} // 5
#else
lbu a4, \charbuf_offs(a0) // 1
\colour_shift_instr a5, a1, \colour_shamt // 1
add a4, a4, t1 // 1
lbu a4, (a4) // 2
srli a6, a4, 4 // 1
andi a4, a4, 0xf // 1
// Get colour bits, add to TMDS LUT base and font bits
and a5, a5, a3 // 1
add a5, a5, t2 // 1
sh3add a4, a4, a5 // 1
sh3add a6, a6, a5 // 1
// Look up and write out 8 TMDS symbols
lw a5, 4(a4) // 1
lw a4, 0(a4) // 1
lw a7, 4(a6) // 1
lw a6, 0(a6) // 1
sw a4, 0(a2) // 1
sw a5, 4(a2) // 1
sw a6, 8(a2) // 1
sw a7, 12(a2) // 1
addi a2, a2, 16 // 1
#endif
.endm
@ -78,9 +106,12 @@
.section .scratch_x.tmds_encode_font_2bpp, "ax"
.global tmds_encode_font_2bpp
#ifndef __riscv
.type tmds_encode_font_2bpp,%function
.thumb_func
#endif
tmds_encode_font_2bpp:
#ifndef __riscv
push {r4-r7, lr}
mov r4, r8
mov r5, r9
@ -123,6 +154,32 @@ tmds_encode_font_2bpp:
mov r10, r6
pop {r4-r7, pc}
#else
// RISC-V implementation. Register roles, as consumed by do_char:
//   a0: character buffer pointer (8 characters are consumed per iteration)
//   a1: on entry, colour buffer pointer; inside the loop, the colour word
//       for the current group of 8 characters
//   a2: output pointer (each do_char stores 16 bytes and advances it)
//   a3: on entry, scaled by 2 to form the output limit; afterwards the
//       colour mask applied by do_char
//   a4: on entry, base that do_char adds to each character code before the
//       font byte load (kept in t1)
//   t0: output limit pointer
//   t2: base of palettised_1bpp_tables
//   t3: colour buffer pointer
// Output limit = output pointer + 2 * a3
sh1add t0, a3, a2
// Colour field mask, pre-scaled to the 8-byte table entry size
li a3, 0xf0 * 8
mv t1, a4
la t2, palettised_1bpp_tables
mv t3, a1
// Nothing to do if the output range is empty
bgeu a2, t0, 2f
1:
// One colour word covers the next 8 characters
lw a1, (t3)
addi t3, t3, 4
// The shift for character N moves bits [4N+3:4N] of the colour word to
// bits [10:7], where the mask in a3 selects them
do_char 0 slli 7
do_char 1 slli 3
do_char 2 srli 1
do_char 3 srli 5
do_char 4 srli 9
do_char 5 srli 13
do_char 6 srli 17
do_char 7 srli 21
addi a0, a0, 8
bltu a2, t0, 1b
2:
ret
#endif
// Table generation:
// levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]

Wyświetl plik

@ -8,7 +8,6 @@
#include "hardware/pll.h"
#include "hardware/sync.h"
#include "hardware/structs/bus_ctrl.h"
#include "hardware/structs/ssi.h"
#include "hardware/vreg.h"
#include "pico/multicore.h"
#include "pico/sem.h"

Wyświetl plik

@ -8,7 +8,6 @@
#include "hardware/gpio.h"
#include "hardware/vreg.h"
#include "hardware/structs/bus_ctrl.h"
#include "hardware/structs/ssi.h"
#include "hardware/dma.h"
#include "pico/sem.h"

Wyświetl plik

@ -127,8 +127,10 @@ void __not_in_flash("render") render_loop() {
tile16(pixbuf, &bg1, y, FRAME_WIDTH);
queue_add_blocking(&dvi0.q_colour_valid, &pixbuf);
}
bg0.xscroll += 1;
bg1.xscroll += 2;
bg1.xscroll += 1;
if (frame_ctr & 1) {
bg0.xscroll += 1;
}
++frame_ctr;
}
}

Wyświetl plik

@ -10,9 +10,19 @@ add_executable(vista-palette
# flash using direct SSI DMA, which would trample on XIP.
pico_set_binary_type(vista-palette copy_to_ram)
pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2_common/boot_stage2/boot2_w25q080.S)
target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
pico_set_boot_stage2(vista-palette vista-palette_boot2)
if (PICO_RP2040)
pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
pico_set_boot_stage2(vista-palette vista-palette_boot2)
target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
else ()
target_compile_definitions(vista-palette PRIVATE
PICO_EMBED_XIP_SETUP=1
PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
PICO_FLASH_SPI_CLKDIV=2
PICO_FLASH_SPI_RXDELAY=3
)
endif()
target_compile_definitions(vista-palette PRIVATE
DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG}

Wyświetl plik

@ -13,6 +13,19 @@ target_compile_definitions(vista PRIVATE
DVI_SYMBOLS_PER_WORD=1
)
if (PICO_RP2040)
pico_define_boot_stage2(vista_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
pico_set_boot_stage2(vista vista_boot2)
target_compile_definitions(vista_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
else ()
target_compile_definitions(vista PRIVATE
PICO_EMBED_XIP_SETUP=1
PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
PICO_FLASH_SPI_CLKDIV=2
PICO_FLASH_SPI_RXDELAY=3
)
endif()
target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200)
target_link_libraries(vista

Wyświetl plik

@ -7,11 +7,17 @@
#include "hardware/pll.h"
#include "hardware/sync.h"
#include "hardware/structs/bus_ctrl.h"
#include "hardware/structs/ssi.h"
#include "hardware/vreg.h"
#include "pico/multicore.h"
#include "pico/sem.h"
#include "pico/stdlib.h"
#if PICO_RP2040
#include "hardware/structs/ssi.h"
#else
#include "hardware/structs/xip_ctrl.h"
#include "hardware/structs/xip_aux.h"
#include "hardware/structs/qmi.h"
#endif
#include "tmds_encode.h"
@ -45,27 +51,40 @@ static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11);
}
void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan)
{
void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan) {
#if PICO_RP2040
// On RP2040, program the SSI to clock the correct amount of data without stopping
ssi_hw->ssienr = 0;
ssi_hw->ctrlr1 = len - 1; // NDF, number of data frames
ssi_hw->dmacr = SSI_DMACR_TDMAE_BITS | SSI_DMACR_RDMAE_BITS;
ssi_hw->ssienr = 1;
// Other than NDF, the SSI configuration used for XIP is suitable for a bulk read too.
dma_hw->ch[dma_chan].read_addr = (uint32_t)&ssi_hw->dr0;
const uintptr_t read_addr = (uintptr_t)&ssi_hw->dr0;
const uint dreq = DREQ_XIP_SSIRX;
const bool bswap = true;
#else
// On RP2350, SSI is gone, but XIP streaming is fast enough to keep up with this demo
// (you can still DMA to the DIRECT_MODE FIFOs if you really need 100%)
xip_ctrl_hw->stream_addr = flash_offs;
xip_ctrl_hw->stream_ctr = len;
const uintptr_t read_addr = (uintptr_t)&xip_aux_hw->stream;
const uint dreq = DREQ_XIP_STREAM;
const bool bswap = false;
#endif
dma_hw->ch[dma_chan].read_addr = read_addr;
dma_hw->ch[dma_chan].write_addr = (uint32_t)rxbuf;
dma_hw->ch[dma_chan].transfer_count = len;
dma_hw->ch[dma_chan].ctrl_trig =
DMA_CH0_CTRL_TRIG_BSWAP_BITS |
DREQ_XIP_SSIRX << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
(uint)bswap << DMA_CH0_CTRL_TRIG_BSWAP_LSB |
dreq << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
dma_chan << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB |
DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS |
DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB |
DMA_CH0_CTRL_TRIG_EN_BITS;
#if PICO_RP2040
// Now DMA is waiting, kick off the SSI transfer (mode continuation bits in LSBs)
ssi_hw->dr0 = (flash_offs << 8) | 0xa0;
#endif
}
// Core 1 handles DMA IRQs and runs TMDS encode on scanline buffers it
@ -91,6 +110,15 @@ int __not_in_flash("main") main() {
sleep_ms(10);
set_sys_clock_khz(DVI_TIMING.bit_clk_khz, true);
// A0 SDK won't pick up on the PICO_EMBED_XIP_SETUP flag, so just to make sure:
#if PICO_RP2350
hw_write_masked(
&qmi_hw->m[0].timing,
3 << QMI_M0_TIMING_RXDELAY_LSB | 2 << QMI_M0_TIMING_CLKDIV_LSB,
QMI_M0_TIMING_RXDELAY_BITS | QMI_M0_TIMING_CLKDIV_BITS
);
#endif
setup_default_uart();
gpio_init(LED_PIN);
@ -131,21 +159,25 @@ int __not_in_flash("main") main() {
}
for (int y = 0; y < 2 * FRAME_HEIGHT; y += 2) {
// Start DMA to back buffer before starting to encode the front buffer (each buffer is two scanlines)
#if !PICO_RP2040
// On RP2040 we could never reach this point early, because of the slow encode!
dma_channel_wait_for_finish_blocking(img_dma_chan);
#endif
flash_bulk_dma_start(
(uint32_t*)img_buf[img_buf_back],
current_image_base + ((y + 2) % (2 * FRAME_HEIGHT)) * IMAGE_SCANLINE_SIZE,
IMAGE_SCANLINE_SIZE * 2 / sizeof(uint32_t),
img_dma_chan
);
const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];
const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];
uint32_t *our_tmds_buf, *their_tmds_buf;
queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf);
multicore_fifo_push_blocking((uint32_t)(img));
multicore_fifo_push_blocking((uint32_t)their_tmds_buf);
queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf);
prepare_scanline((const uint32_t*)(img + FRAME_WIDTH * 2), our_tmds_buf);
multicore_fifo_pop_blocking();
queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf);
queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf);
@ -156,4 +188,3 @@ int __not_in_flash("main") main() {
}
__builtin_unreachable();
}

Plik binarny nie jest wyświetlany.

Wyświetl plik

@ -28,6 +28,17 @@ static const struct dvi_serialiser_cfg picodvi_reva_dvi_cfg = {
.invert_diffpairs = true
};
// AMY-DVI board, for getting HDMI from the RP2350 FPGA development platform,
// again a cursed board that only a couple of people in the world possess:
static const struct dvi_serialiser_cfg amy_dvi_cfg = {
.pio = DVI_DEFAULT_PIO_INST,
.sm_tmds = {0, 1, 2},
.pins_tmds = {14, 16, 18},
.pins_clk = 12,
.invert_diffpairs = true
};
// The not-HDMI socket on Rev C PicoDVI boards
// (we don't talk about Rev B)
static const struct dvi_serialiser_cfg picodvi_dvi_cfg = {

Wyświetl plik

@ -184,7 +184,7 @@ static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
// Make sure all three channels have definitely loaded their last block
// (should be within a few cycles of one another)
for (int i = 0; i < N_TMDS_LANES; ++i) {
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].dbg_tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
tight_loop_contents();
}

Wyświetl plik

@ -51,8 +51,16 @@
#define DVI_SYMBOLS_PER_WORD 2
#endif
#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
// Implement TMDS encode with hardware encoders in SIO, instead of
// interpolators + LUTs. The processor still has to crank the encoder, but
// it's much faster. This still works with PIO serialisers, which can appear
// on any GPIO, unlike the HSTX which is limited to specific GPIOs.
#ifndef DVI_USE_SIO_TMDS_ENCODER
#if PICO_RP2040
#define DVI_USE_SIO_TMDS_ENCODER 0
#else
#define DVI_USE_SIO_TMDS_ENCODER 1
#endif
#endif
// ----------------------------------------------------------------------------

Wyświetl plik

@ -2,6 +2,10 @@
#include "hardware/regs/sio.h"
#include "dvi_config_defs.h"
// This file contains both Arm and RISC-V source, with the correct version
// selected via the __arm__ and __riscv predefined macros. The targeted Arm
// dialect is Armv6-M, and the targeted RISC-V dialect is RV32IZba
// Offsets suitable for ldr/str (must be <= 0x7c):
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
@ -13,23 +17,33 @@
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)
#if defined(__arm__) && defined(__riscv)
#error "wat"
#endif
#ifdef __arm__
.syntax unified
.cpu cortex-m0plus
.thumb
#endif
.macro decl_func_x name
.section .scratch_x.\name, "ax"
.global \name
#ifdef __arm__
.type \name,%function
.thumb_func
#endif
\name:
.endm
.macro decl_func_y name
.section .scratch_y.\name, "ax"
.global \name
#ifdef __arm__
.type \name,%function
.thumb_func
#endif
\name:
.endm
@ -41,7 +55,10 @@
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift (for the *_leftshift variant only -- costs 1 cycle per 2 pixels)
#if defined(__arm__)
// Armv6-M:
.macro do_channel_16bpp r_ibase r_inout0 r_out1
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
@ -50,8 +67,11 @@
ldr \r_out1, [\r_out1]
.endm
decl_func tmds_encode_loop_16bpp
.macro tmds_encode_loop_16bpp_impl leftshift
push {r4, r5, r6, r7, lr}
// Bounds calculation: each input pixel results in two output pixels,
// whose two TMDS symbols are packed in a single 32-bit word. So, 4 bytes
// out per one pixel in.
lsls r2, #2
add r2, r1
mov ip, r2
@ -61,7 +81,13 @@ decl_func tmds_encode_loop_16bpp
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4, r6}
.if \leftshift
lsls r4, r3
.endif
do_channel_16bpp r2, r4, r5
.if \leftshift
lsls r6, r3
.endif
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
@ -69,82 +95,72 @@ decl_func tmds_encode_loop_16bpp
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
.endm
// Same as above, but scale data to make up for lack of left shift
// in interpolator (costs 1 cycle per 2 pixels)
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
#elif defined(__riscv)
// RISC-V: write one input word to the interpolator accumulator and fetch the
// two words addressed by the resulting PEEK0/PEEK1 values.
//   \r_ibase:  base address for the ACCUM0/PEEK0/PEEK1 accesses
//   \r_inout0: input word on entry; word loaded through PEEK0 on exit
//   \r_out1:   word loaded through PEEK1 on exit
.macro do_channel_16bpp r_ibase r_inout0 r_out1
sw \r_inout0, ACCUM0_OFFS(\r_ibase)
// Note two halves are interleaved to avoid load->addr dependency
lw \r_inout0, PEEK0_OFFS(\r_ibase)
lw \r_out1, PEEK1_OFFS(\r_ibase)
lw \r_inout0, (\r_inout0)
lw \r_out1, (\r_out1)
.endm
// RISC-V body shared by tmds_encode_loop_16bpp and its _leftshift variant.
//   a0: input buffer pointer
//   a1: output buffer pointer
//   a2: input size in pixels on entry; reused as the interpolator base
//   a3: left shift amount (only read when \leftshift is nonzero)
//   t0: output limit pointer
// Each unrolled step reads 2 input words and writes 4 output words.
// Clobbers a4-a7.
.macro tmds_encode_loop_16bpp_impl leftshift
// Output limit: 4 bytes out per input pixel
slli a2, a2, 2
add t0, a2, a1
// Nothing to do if the output range is empty
bgeu a1, t0, 2f
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
.align 2
1:
// i indexes the unrolled step, so the pointers are bumped once per pass
.set i, 0
.rept TMDS_ENCODE_UNROLL
lw a4, 8 * i + 0(a0)
lw a6, 8 * i + 4(a0)
.if \leftshift
sll a4, a4, a3
sll a6, a6, a3
.endif
do_channel_16bpp a2, a4, a5
do_channel_16bpp a2, a6, a7
sw a4, 16 * i + 0(a1)
sw a5, 16 * i + 4(a1)
sw a6, 16 * i + 8(a1)
sw a7, 16 * i + 12(a1)
.set i, i + 1
.endr
addi a0, a0, 8 * TMDS_ENCODE_UNROLL
addi a1, a1, 16 * TMDS_ENCODE_UNROLL
bltu a1, t0, 1b
2:
ret
.endm
#else
#error "Unknown architecture"
#endif
decl_func tmds_encode_loop_16bpp
tmds_encode_loop_16bpp_impl 0
decl_func tmds_encode_loop_16bpp_leftshift
push {r4, r5, r6, r7, lr}
lsls r2, #2
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4, r6}
lsls r4, r3
do_channel_16bpp r2, r4, r5
lsls r6, r3
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
tmds_encode_loop_16bpp_impl 1
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
decl_func tmds_encode_loop_8bpp
push {r4, r5, r6, r7, lr}
lsls r2, #2
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
str r4, [r2, #ACCUM0_OFFS + INTERP1]
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r5, [r2, #PEEK1_OFFS]
ldr r5, [r5]
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
ldr r6, [r6]
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
ldr r7, [r7]
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
// r3: Left shift amount (for the *_leftshift variant of the function)
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
decl_func tmds_encode_loop_8bpp_leftshift
#if defined(__arm__)
.macro tmds_encode_loop_8bpp_impl leftshift
push {r4, r5, r6, r7, lr}
lsls r2, #3
lsls r2, #2
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@ -154,7 +170,9 @@ decl_func tmds_encode_loop_8bpp_leftshift
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
str r4, [r2, #ACCUM0_OFFS + INTERP1]
.if \leftshift
lsls r4, r3
.endif
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
@ -170,6 +188,54 @@ decl_func tmds_encode_loop_8bpp_leftshift
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
.endm
#elif defined(__riscv)
// RISC-V body shared by tmds_encode_loop_8bpp and its _leftshift variant.
//   a0: input buffer pointer (word-aligned)
//   a1: output buffer pointer (word-aligned)
//   a2: input size in pixels on entry; reused as the interpolator base
//   a3: left shift amount (only read when \leftshift is nonzero)
//   t0: output limit pointer
// Each unrolled step reads 1 input word and writes 4 output words.
// Clobbers a4-a7.
.macro tmds_encode_loop_8bpp_impl leftshift
// Output limit: 4 bytes out per input pixel. The limit is formed directly
// in t0, matching tmds_encode_loop_16bpp_impl, since a2 is about to be
// reused for the interpolator base.
slli a2, a2, 2
add t0, a2, a1
// Nothing to do if the output range is empty
bgeu a1, t0, 2f
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
.align 2
1:
// i indexes the unrolled step, so the pointers are bumped once per pass
.set i, 0
.rept TMDS_ENCODE_UNROLL
lw a4, 4 * i(a0)
// The second interpolator receives the unshifted word; only the word
// written to the first interpolator is left-shifted
sw a4, ACCUM0_OFFS + INTERP1(a2)
.if \leftshift
sll a4, a4, a3
.endif
sw a4, ACCUM0_OFFS(a2)
// Address loads are issued in pairs ahead of the loads that use them
lw a4, PEEK0_OFFS(a2)
lw a5, PEEK1_OFFS(a2)
lw a4, (a4)
lw a5, (a5)
lw a6, PEEK0_OFFS + INTERP1(a2)
lw a7, PEEK1_OFFS + INTERP1(a2)
lw a6, (a6)
lw a7, (a7)
sw a4, 16 * i + 0(a1)
sw a5, 16 * i + 4(a1)
sw a6, 16 * i + 8(a1)
sw a7, 16 * i + 12(a1)
.set i, i + 1
.endr
addi a0, a0, TMDS_ENCODE_UNROLL * 4
addi a1, a1, TMDS_ENCODE_UNROLL * 16
bltu a1, t0, 1b
2:
ret
.endm
#else
#error "Unknown architecture"
#endif
decl_func tmds_encode_loop_8bpp
tmds_encode_loop_8bpp_impl 0
decl_func tmds_encode_loop_8bpp_leftshift
tmds_encode_loop_8bpp_impl 1
// ----------------------------------------------------------------------------
// Fast 1bpp black/white encoder (full res)
@ -190,6 +256,8 @@ decl_func tmds_encode_loop_8bpp_leftshift
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 2.125 cyc/pix
#if defined(__arm__)
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
\shift_instr0 r4, r2, #\shamt0
ands r4, r3
@ -238,6 +306,58 @@ decl_func tmds_encode_1bpp
mov r8, r7
pop {r4-r7, pc}
#elif defined(__riscv)
// TODO the register allocation is not optimal here for code size
// RISC-V: perform two table lookups from the input word in a2 and append the
// results at a1.
//   \shift_instr0 \shamt0: shift placing the first field under the mask
//   \shift_instr1 \shamt1: shift placing the second field under the mask
//   a3: field mask (already scaled to a byte offset into the table)
//   t1: table base
// Two words are read per lookup; 16 bytes are stored and a1 advances by 16.
// Clobbers a4-a7.
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
\shift_instr0 a4, a2, \shamt0
and a4, a4, a3
add a4, a4, t1
lw a5, 4(a4)
lw a4, 0(a4)
\shift_instr1 a6, a2, \shamt1
and a6, a6, a3
add a6, a6, t1
lw a7, 4(a6)
lw a6, 0(a6)
sw a4, 0(a1)
sw a5, 4(a1)
sw a6, 8(a1)
sw a7, 12(a1)
addi a1, a1, 16
.endm
// RISC-V 1bpp encoder.
// a0: input buffer (word-aligned)
// a1: output buffer (word-aligned)
// a2: output pixel count
//
// Inside the loop a2 holds the current input word, a3 the lookup mask, t0
// the output limit pointer and t1 the table base. Each iteration consumes
// one input word and stores 64 bytes, so whole 32-pixel groups are always
// written even if the limit falls inside a group.
decl_func tmds_encode_1bpp
// Output limit: 2 bytes out per pixel
slli a2, a2, 1
add t0, a2, a1
la t1, tmds_1bpp_table
// Mask: 4 bit index, 8 bytes per entry
li a3, 0x78
// Nothing to do if the output range is empty
bgeu a1, t0, 2f
1:
lw a2, (a0)
addi a0, a0, 4
// Each body call handles two 4-bit fields; the two build variants differ
// only in the order the fields of each pair are emitted
#if !DVI_1BPP_BIT_REVERSE
tmds_encode_1bpp_body slli 3 srli 1
tmds_encode_1bpp_body srli 5 srli 9
tmds_encode_1bpp_body srli 13 srli 17
tmds_encode_1bpp_body srli 21 srli 25
#else
tmds_encode_1bpp_body srli 1 slli 3
tmds_encode_1bpp_body srli 9 srli 5
tmds_encode_1bpp_body srli 17 srli 13
tmds_encode_1bpp_body srli 25 srli 21
#endif
bltu a1, t0, 1b
2:
ret
#else
#error "Unknown architecture"
#endif
.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
@ -299,6 +419,7 @@ tmds_1bpp_table:
// level 2: (a5 -> 163) always
// level 3: (ef -> 2f0) always
#if defined(__arm__)
// Table base pointer in r0. Input pixels in r2.
.macro encode_2bpp_body shift_instr shamt rd
\shift_instr \rd, r2, #\shamt
@ -343,6 +464,55 @@ decl_func tmds_encode_2bpp
mov r8, r7
pop {r4-r7, pc}
#elif defined(__riscv)
// Table base pointer in a0. Input pixels in a2.
// Shifts a2 by \shamt using \shift_instr, masks the result with a3, adds the
// table base and loads one word from that address into \rd.
.macro encode_2bpp_body shift_instr shamt rd
\shift_instr \rd, a2, \shamt
and \rd, \rd, a3
add \rd, \rd, a0
lw \rd, (\rd)
.endm
// RISC-V 2bpp encoder.
// a0: input buffer (word-aligned)
// a1: output buffer (word-aligned)
// a2: output pixel count
//
// Register use inside the function:
//   a0: table base (the input pointer is moved to t1 first, because
//       encode_2bpp_body expects the table base in a0)
//   a2: current input word
//   a3: lookup mask
//   t0: output limit pointer
//   t1: input pointer
//   a4-a7: looked-up output words
// Each iteration consumes one input word and stores 8 output words.
decl_func tmds_encode_2bpp
mv t1, a0
la a0, tmds_2bpp_table
// Mask: 4-bit index into 4-byte entries.
li a3, 0x3c
// Limit pointer: 1 word per 2 pixels
slli a2, a2, 1
add t0, a2, a1
// Skip the loop when the output range is empty. This has to be a forward
// reference to the exit label: a backward reference (1b) would bind to the
// nearest preceding "1:" label, which lies outside this function.
bgeu a1, t0, 2f
1:
lw a2, (t1)
addi t1, t1, 4
// First four lookups: input bits [3:0], [7:4], [11:8], [15:12]
encode_2bpp_body slli 2 a4
encode_2bpp_body srli 2 a5
encode_2bpp_body srli 6 a6
encode_2bpp_body srli 10 a7
sw a4, 0(a1)
sw a5, 4(a1)
sw a6, 8(a1)
sw a7, 12(a1)
// Last four lookups: input bits [19:16], [23:20], [27:24], [31:28]
encode_2bpp_body srli 14 a4
encode_2bpp_body srli 18 a5
encode_2bpp_body srli 22 a6
encode_2bpp_body srli 26 a7
sw a4, 16(a1)
sw a5, 20(a1)
sw a6, 24(a1)
sw a7, 28(a1)
addi a1, a1, 32
bltu a1, t0, 1b
2:
ret
#else
#error "Unknown architecture"
#endif
.align 2
tmds_2bpp_table:
.word 0x7f103 // 00, 00
@ -404,17 +574,20 @@ tmds_2bpp_table:
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.
.macro tmds_fullres_encode_loop_body ra rb
#if defined(__arm__)
.macro tmds_fullres_encode_loop_body leftshift ra rb
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
.if \leftshift
lsls \ra, r3
.endif
str \ra, [r2, #ACCUM0_OFFS]
// Loads interleaved to avoid rdata->addr stall on M33
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \ra, [\ra]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
@ -422,8 +595,9 @@ tmds_2bpp_table:
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
.macro tmds_fullres_encode_loop_16bpp
.macro tmds_fullres_encode_loop_16bpp leftshift
push {r4-r7, lr}
mov r4, r8
push {r4}
@ -451,8 +625,8 @@ tmds_2bpp_table:
1:
.rept 16
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body r4 r5
tmds_fullres_encode_loop_body r6 r7
tmds_fullres_encode_loop_body \leftshift r4 r5
tmds_fullres_encode_loop_body \leftshift r6 r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
@ -465,82 +639,77 @@ tmds_2bpp_table:
pop {r4-r7, pc}
.endm
// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
tmds_fullres_encode_loop_16bpp
#elif defined(__riscv)
.macro tmds_fullres_encode_loop_body_leftshift ra rb
// Note we apply the leftshift for INTERP0 only
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
lsls \ra, r3
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
.macro tmds_fullres_encode_loop_body leftshift ra rb
sw \ra, ACCUM0_OFFS + INTERP1(a2)
.if \leftshift
sll \ra, \ra, a3
.endif
sw \ra, ACCUM0_OFFS(a2)
lw \ra, PEEK2_OFFS(a2)
lw \rb, PEEK2_OFFS + INTERP1(a2)
lw \ra, (\ra)
lw \rb, (\rb)
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
sw \ra, ACCUM1_ADD_OFFS(a2)
sw \rb, ACCUM1_ADD_OFFS + INTERP1(a2)
#endif
.endm
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
// a0: Input buffer (word-aligned)
// a1: Output buffer (word-aligned)
// a2: Pixel count
// a3: Left shift amount
.macro tmds_fullres_encode_loop_16bpp_leftshift
push {r4-r7, lr}
mov r4, r8
mov r5, r9
push {r4-r5}
lsls r2, #2
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
.macro tmds_fullres_encode_loop_16bpp leftshift
sh2add t0, a2, a1
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
li a4, 0
sw a4, ACCUM1_OFFS(a2)
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
mvns r4, r4
// Alternate parity between odd/even symbols if no feedback
li a4, -1
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
sw a4, ACCUM1_OFFS + INTERP1(a2)
adr r4, 1f
adds r4, #1
mov r8, r4
b 2f
bgeu a1, t0, 2f
.align 2
1:
.rept 16 // 64 pixels per iteration
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body_leftshift r4 r5
tmds_fullres_encode_loop_body_leftshift r6 r7
stmia r1!, {r4, r5, r6, r7}
.set i, 0
.rept 16
lw a4, 8 * i + 0(a0)
lw a6, 8 * i + 4(a0)
tmds_fullres_encode_loop_body \leftshift a4 a5
tmds_fullres_encode_loop_body \leftshift a6 a7
sw a4, 16 * i + 0(a1)
sw a5, 16 * i + 4(a1)
sw a6, 16 * i + 8(a1)
sw a7, 16 * i + 12(a1)
.set i, i + 1
.endr
addi a0, a0, 8 * i
addi a1, a1, 16 * i
bltu a1, t0, 1b
2:
cmp r1, ip
beq 1f
bx r8
1:
pop {r4-r5}
mov r8, r4
mov r9, r5
pop {r4-r7, pc}
ret
.endm
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
tmds_fullres_encode_loop_16bpp_leftshift
#else
#error "Unknown architecture"
#endif
// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
tmds_fullres_encode_loop_16bpp 0
decl_func_y tmds_fullres_encode_loop_16bpp_y
tmds_fullres_encode_loop_16bpp 0
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
tmds_fullres_encode_loop_16bpp 1
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
tmds_fullres_encode_loop_16bpp 1
// ----------------------------------------------------------------------------
// Full-resolution 8bpp paletted encode
@ -550,19 +719,19 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
// base is set to a reordered list of TMDS symbols based
// on a user colour palette.
#ifdef __arm__
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
.macro tmds_palette_encode_loop_body rd
str \rd, [r2, #ACCUM0_OFFS]
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
// Loads interleaved to avoid rdata->addr stall on M33
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr \rd, [\rd]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rd, [r2, #ACCUM1_ADD_OFFS]
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
lsls r7, #10
@ -617,7 +786,241 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
pop {r4-r7, pc}
.endm
#elif defined(__riscv)
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. a2 contains
// interp base pointer. a5 used as temporary.
//
// The same input is written to the accumulators of both interpolators. Each
// PEEK2 result is then used as the address of a word load, and the word
// from the second interpolator is merged in at bit 10 and above.
.macro tmds_palette_encode_loop_body rd
sw \rd, ACCUM0_OFFS(a2)
sw \rd, ACCUM0_OFFS + INTERP1(a2)
// Both address loads are issued before either dependent load
lw \rd, PEEK2_OFFS(a2)
lw a5, PEEK2_OFFS + INTERP1(a2)
lw \rd, (\rd)
lw a5, (a5)
#if !TMDS_FULLRES_NO_DC_BALANCE
// Feed the looked-up words back through the ACCUM1_ADD registers
sw \rd, ACCUM1_ADD_OFFS(a2)
sw a5, ACCUM1_ADD_OFFS + INTERP1(a2)
#endif
slli a5, a5, 10
or \rd, \rd, a5
.endm
// RISC-V body of the palette encode loop (instantiated once per scratch bank).
//   a0: input pointer (two words are read per unrolled step)
//   a1: output pointer (four words are written per unrolled step)
//   a2: count on entry, scaled by 2 to form the output limit; then reused
//       as the interpolator base
//   t0: output limit pointer
//   t1: holds the entry value of s0, which is used as scratch and restored
//       before returning
// Clobbers a3-a5.
.macro tmds_palette_encode_loop
mv t1, s0
sh1add t0, a2, a1
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
// DC balance defined to be 0 at start of scanline:
li a4, 0
sw a4, ACCUM1_OFFS(a2)
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
li a4, -1
#endif
sw a4, ACCUM1_OFFS + INTERP1(a2)
// Nothing to do if the output range is empty
bgeu a1, t0, 2f
.align 2
1:
// i indexes the unrolled step, so the pointers are bumped once per pass
.set i, 0
.rept 10
lw a3, 8 * i + 0(a0)
lw s0, 8 * i + 4(a0)
// Split each input word into two halves, each positioned at bits [17:2]
// as tmds_palette_encode_loop_body expects
srli a4, a3, 14
slli a3, a3, 2
tmds_palette_encode_loop_body a3
tmds_palette_encode_loop_body a4
sw a3, 16 * i + 0(a1)
sw a4, 16 * i + 4(a1)
srli a4, s0, 14
slli s0, s0, 2
tmds_palette_encode_loop_body s0
tmds_palette_encode_loop_body a4
sw s0, 16 * i + 8(a1)
sw a4, 16 * i + 12(a1)
.set i, i + 1
.endr
// i equals the repeat count here
addi a0, a0, 8 * i
addi a1, a1, 16 * i
bltu a1, t0, 1b
2:
mv s0, t1
ret
.endm
#endif
decl_func_x tmds_palette_encode_loop_x
tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
tmds_palette_encode_loop
// ----------------------------------------------------------------------------
// Hand-cranking loops for SIO TMDS encoders
#if DVI_USE_SIO_TMDS_ENCODER
#if defined(__arm__)
// Arm body for the SIO TMDS encoder loops.
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: pixel count
//
// \size_ratio: number of output words read back per input word
// \peek:       when nonzero, alternate reads use the PEEK register in place
//              of POP
// r3 holds the TMDS register base; r4 is scratch (saved and restored).
.macro tmds_encode_sio_loop size_ratio peek
// For larger load/store offsets at high ratios/unroll:
.cpu cortex-m33
// Input words per loop pass: scaled down as the ratio grows, minimum 1
.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
.set unroll, 1
.else
.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
.endif
// Offset delta that redirects a POP read to the matching PEEK register
.if \peek
.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
.else
.set even_offset_adj, 0
.endif
push {r4, lr}
// Output limit: 4 bytes per pixel at one symbol per word, otherwise 2
#if DVI_SYMBOLS_PER_WORD == 1
lsls r2, r2, #2
#else
lsls r2, r2, #1
#endif
adds r2, r1
ldr r3, =SIO_BASE + SIO_TMDS_CTRL_OFFSET
// Enter at the loop test so an empty output range does no work
b 2f
1:
.set i, 0
.rept unroll
ldr r4, [r0, #i * 4]
str r4, [r3, #SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET]
.set j, 0
.rept \size_ratio
// The adjustment is applied when (1 + output word index) is odd, so with
// \peek set the reads alternate PEEK, POP, PEEK, POP, ...
.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
#if DVI_SYMBOLS_PER_WORD == 2
ldr r4, [r3, #offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET]
#else
ldr r4, [r3, #offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET]
#endif
str r4, [r1, #4 * (j + i * \size_ratio)]
.set j, j + 1
.endr
.set i, i + 1
.endr
adds r0, 4 * unroll
adds r1, 4 * unroll * \size_ratio
2:
cmp r1, r2
blo 1b
pop {r4, pc}
// Restore the CPU selection used by the rest of the file
.cpu cortex-m0plus
.endm
#elif defined(__riscv)
// RISC-V body for the SIO TMDS encoder loops.
// a0: input buffer (word-aligned)
// a1: output buffer (word-aligned)
// a2: pixel count
//
// \size_ratio: number of output words read back per input word
// \peek:       when nonzero, alternate reads use the PEEK register in place
//              of POP
// a2 becomes the output limit pointer, a3 holds the TMDS register base and
// a4 is scratch.
.macro tmds_encode_sio_loop size_ratio peek
// Input words per loop pass: scaled down as the ratio grows, minimum 1
.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
.set unroll, 1
.else
.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
.endif
// Offset delta that redirects a POP read to the matching PEEK register
.if \peek
.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
.else
.set even_offset_adj, 0
.endif
// Output limit: 4 bytes per pixel at one symbol per word, otherwise 2
#if DVI_SYMBOLS_PER_WORD == 1
sh2add a2, a2, a1
#else
sh1add a2, a2, a1
#endif
li a3, SIO_BASE + SIO_TMDS_CTRL_OFFSET
// Nothing to do if the output range is empty
bgeu a1, a2, 2f
1:
.set i, 0
.rept unroll
lw a4, i * 4(a0)
sw a4, SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
.set j, 0
.rept \size_ratio
// The adjustment is applied when (1 + output word index) is odd, so with
// \peek set the reads alternate PEEK, POP, PEEK, POP, ...
.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
#if DVI_SYMBOLS_PER_WORD == 2
lw a4, offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
#else
lw a4, offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
#endif
sw a4, 4 * (j + i * \size_ratio)(a1)
.set j, j + 1
.endr
.set i, i + 1
.endr
addi a0, a0, 4 * unroll
addi a1, a1, 4 * unroll * \size_ratio
bltu a1, a2, 1b
2:
ret
.endm
#else
#error "Unknown architecture"
#endif
// For DVI_SYMBOLS_PER_WORD == 2, the ratio of output : input buffer size is:
//
// Bits/pixel | Ratio (with hdouble) | Ratio (no hdouble)
// -----------+----------------------+-------------------
// 1 | 32 | 16
// 2 | 16 | 8
// 4 | 8 | 4
// 8 | 4 | 2
// 16 | 2 | 1
//
// For DVI_SYMBOLS_PER_WORD == 1, these ratios are doubled.
// poppop variants will read from a xxx_POP register for every output word
decl_func tmds_encode_sio_loop_poppop_ratio1
tmds_encode_sio_loop 1, 0
decl_func tmds_encode_sio_loop_poppop_ratio2
tmds_encode_sio_loop 2, 0
decl_func tmds_encode_sio_loop_poppop_ratio4
tmds_encode_sio_loop 4, 0
decl_func tmds_encode_sio_loop_poppop_ratio8
tmds_encode_sio_loop 8, 0
decl_func tmds_encode_sio_loop_poppop_ratio16
tmds_encode_sio_loop 16, 0
decl_func tmds_encode_sio_loop_poppop_ratio32
tmds_encode_sio_loop 32, 0
decl_func tmds_encode_sio_loop_poppop_ratio64
tmds_encode_sio_loop 64, 0
// peekpop variants will read alternately from xxx_PEEK and xxx_POP: this is
// needed for pixel-doubled output when DVI_SYMBOLS_PER_WORD == 1 (note the
// POP value is different from the PEEK value, as it's the same data but with
// different running DC balance)
decl_func tmds_encode_sio_loop_peekpop_ratio1
tmds_encode_sio_loop 1, 1
decl_func tmds_encode_sio_loop_peekpop_ratio2
tmds_encode_sio_loop 2, 1
decl_func tmds_encode_sio_loop_peekpop_ratio4
tmds_encode_sio_loop 4, 1
decl_func tmds_encode_sio_loop_peekpop_ratio8
tmds_encode_sio_loop 8, 1
decl_func tmds_encode_sio_loop_peekpop_ratio16
tmds_encode_sio_loop 16, 1
decl_func tmds_encode_sio_loop_peekpop_ratio32
tmds_encode_sio_loop 32, 1
decl_func tmds_encode_sio_loop_peekpop_ratio64
tmds_encode_sio_loop 64, 1
#endif

Wyświetl plik

@ -3,7 +3,7 @@
#include "hardware/gpio.h"
#include "hardware/sync.h"
static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
static const __unused uint32_t __scratch_x("tmds_table") tmds_table[] = {
#include "tmds_table.h"
};
@ -11,14 +11,15 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
// memory. There is a third copy which can go in flash, because it's just used
// to generate palette LUTs. The ones we don't use will get garbage collected
// during linking.
const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
const __unused uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
#include "tmds_table_fullres.h"
};
const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
const __unused uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
#include "tmds_table_fullres.h"
};
#if !DVI_USE_SIO_TMDS_ENCODER
// Configure an interpolator to extract a single colour channel from each of a pair
// of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
// pixel_width wide. Produce a LUT address for the first pixel's colour data on
@ -35,11 +36,16 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp
int shift_channel_to_index = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
int oops = 0;
#if PICO_RP2040
if (shift_channel_to_index < 0) {
// "It's ok we'll fix it in software"
oops = -shift_channel_to_index;
shift_channel_to_index = 0;
}
#else
// Now a right-rotate, not a right-shift
shift_channel_to_index &= 0x1f;
#endif
uint index_msb = index_shift + lut_index_width - 1;
@ -60,23 +66,60 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp
return oops;
}
#else
// Encoding a single channel at a time is not the most efficient way to use
// this hardware, because it means we read the colour buffer multiple times,
// but it fits better with how things are done in software on RP2040.
//
// Program SIO TMDS_CTRL so that lane 0 encodes one colour channel:
//   channel_msb, channel_lsb: bit positions of the channel within a pixel;
//     the field may be 1 to 8 bits wide
//   pixel_width: bits per pixel; the PIX_SHIFT field is set to 1 + log2 of
//     this, so it is expected to be a power of two
//   hdouble: written to the PIX2_NOSHIFT bit; callers pass true for
//     horizontally doubled output
// The write also sets CLEAR_BALANCE_BITS and assigns the whole register, so
// any previous encoder configuration is replaced.
static void __not_in_flash_func(configure_sio_tmds_for_single_channel)(uint channel_msb, uint channel_lsb, uint pixel_width, bool hdouble) {
assert(channel_msb - channel_lsb <= 7); // 1 through 8 bits, inclusive
sio_hw->tmds_ctrl =
SIO_TMDS_CTRL_CLEAR_BALANCE_BITS |
// NBITS field holds the channel width minus one
((channel_msb - channel_lsb) << SIO_TMDS_CTRL_L0_NBITS_LSB) |
// Rotate amount is (channel_msb - 7) modulo 16, i.e. it wraps for
// channels whose MSB is below bit 7
(((channel_msb - 7u) & 0xfu) << SIO_TMDS_CTRL_L0_ROT_LSB) |
((1 + __builtin_ctz(pixel_width)) << SIO_TMDS_CTRL_PIX_SHIFT_LSB) |
((uint)hdouble << SIO_TMDS_CTRL_PIX2_NOSHIFT_LSB);
}
#endif
// Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer
// of TMDS symbols from this colour channel. Number of pixels must be even,
// pixel buffer must be word-aligned.
//
// Two implementations are selected at compile time:
//  - DVI_USE_SIO_TMDS_ENCODER: the SIO TMDS encoder is configured for this
//    channel with horizontal doubling, and the asm loop is asked for
//    2 * n_pix output symbols. (This path's configuration helper accepts
//    channels up to 8 bits wide.)
//  - otherwise: INTERP0 is saved, set up to generate lookup addresses into
//    tmds_table, used by the asm encode loop, and then restored.
void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
#if DVI_USE_SIO_TMDS_ENCODER
configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, true);
#if DVI_SYMBOLS_PER_WORD == 1
// One symbol per output word: each 32-bit input word (two pixels) yields
// four output words, alternating PEEK and POP reads for the doubled pixels.
tmds_encode_sio_loop_peekpop_ratio4(pixbuf, symbuf, 2 * n_pix);
#else
// Two symbols per output word: each input word yields two output words.
tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, 2 * n_pix);
#endif
#else
interp_hw_save_t interp0_save;
interp_save(interp0_hw, &interp0_save);
int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
#if PICO_RP2040
// On RP2040 the interpolator setup may report that a left shift is still
// required; that case is handled by the _leftshift loop variant.
if (require_lshift)
tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
else
tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
#else
// On other platforms no residual left shift is expected.
assert(!require_lshift); (void)require_lshift;
tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
#endif
interp_restore(interp0_hw, &interp0_save);
#endif
}
// As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned.
void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
#if DVI_USE_SIO_TMDS_ENCODER
configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 8, true);
#if DVI_SYMBOLS_PER_WORD == 1
tmds_encode_sio_loop_peekpop_ratio8(pixbuf, symbuf, 2 * n_pix);
#else
tmds_encode_sio_loop_poppop_ratio4(pixbuf, symbuf, 2 * n_pix);
#endif
#else
interp_hw_save_t interp0_save, interp1_save;
interp_save(interp0_hw, &interp0_save);
interp_save(interp1_hw, &interp1_save);
@ -86,12 +129,18 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
assert(!lshift_upper); (void)lshift_upper;
#if PICO_RP2040
if (require_lshift)
tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
else
tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
#else
assert(!require_lshift); (void)require_lshift;
tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
#endif
interp_restore(interp0_hw, &interp0_save);
interp_restore(interp1_hw, &interp1_save);
#endif
}
// ----------------------------------------------------------------------------
@ -103,16 +152,22 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
// pixels, and INTERP1 for odd pixels. Note this means that even and odd
// symbols have their DC balance handled separately, which is not to spec.
#if !DVI_USE_SIO_TMDS_ENCODER
static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
const uint index_shift = 2; // scaled lookup for 4-byte LUT entries
int shift_channel_to_index = channel_msb - (lut_index_width - 1) - index_shift;
int oops = 0;
#if PICO_RP2040
if (shift_channel_to_index < 0) {
// "It's ok we'll fix it in software"
oops = -shift_channel_to_index;
shift_channel_to_index = 0;
}
#else
// Now a right-rotate rather than right-shift
shift_channel_to_index &= 0x1f;
#endif
uint index_msb = index_shift + lut_index_width - 1;
@ -133,8 +188,17 @@ static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t
return oops;
}
#endif
void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
#if DVI_USE_SIO_TMDS_ENCODER
configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, false);
#if DVI_SYMBOLS_PER_WORD == 1
tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, n_pix);
#else
tmds_encode_sio_loop_poppop_ratio1(pixbuf, symbuf, n_pix);
#endif
#else
uint core = get_core_num();
#if !TMDS_FULLRES_NO_INTERP_SAVE
interp_hw_save_t interp0_save, interp1_save;
@ -165,17 +229,16 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
interp_restore(interp0_hw, &interp0_save);
interp_restore(interp1_hw, &interp1_save);
#endif
#endif
}
static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };
static inline int byte_imbalance(uint32_t x)
{
static inline int byte_imbalance(uint32_t x) {
return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF];
}
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
{
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym) {
int pixel_imbalance = byte_imbalance(pixel);
uint32_t sym = pixel & 1;
if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {

Wyświetl plik

@ -34,4 +34,23 @@ void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t
void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
#if !PICO_RP2040
// Crank the SIO TMDS encoder:
void tmds_encode_sio_loop_poppop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_poppop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_sio_loop_peekpop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
#endif
#endif

Wyświetl plik

@ -3,13 +3,24 @@ add_library(libsprite INTERFACE)
target_sources(libsprite INTERFACE
${CMAKE_CURRENT_LIST_DIR}/affine_transform.h
${CMAKE_CURRENT_LIST_DIR}/sprite_asm_const.h
${CMAKE_CURRENT_LIST_DIR}/sprite.S
${CMAKE_CURRENT_LIST_DIR}/sprite.c
${CMAKE_CURRENT_LIST_DIR}/sprite.h
${CMAKE_CURRENT_LIST_DIR}/tile.S
${CMAKE_CURRENT_LIST_DIR}/tile.c
${CMAKE_CURRENT_LIST_DIR}/tile.h
)
if (PICO_RISCV)
target_sources(libsprite INTERFACE
${CMAKE_CURRENT_LIST_DIR}/sprite_riscv.S
${CMAKE_CURRENT_LIST_DIR}/tile_riscv.S
)
else ()
target_sources(libsprite INTERFACE
${CMAKE_CURRENT_LIST_DIR}/sprite_armv6m.S
${CMAKE_CURRENT_LIST_DIR}/tile_armv6m.S
)
endif()
target_include_directories(libsprite INTERFACE ${CMAKE_CURRENT_LIST_DIR})
target_link_libraries(libsprite INTERFACE pico_base_headers hardware_interp)

Wyświetl plik

@ -4,7 +4,7 @@
// Stolen from RISCBoy
#include <stdint.h>
#include "pico/platform.h"
#include "pico.h"
// Store unpacked affine transforms as signed 16.16 fixed point in the following order:
// a00, a01, b0, a10, a11, b1

Wyświetl plik

@ -1,7 +1,7 @@
#include "sprite.h"
#include "affine_transform.h"
#include "pico/platform.h" // for __not_in_flash
#include "pico.h" // for __not_in_flash
#include "hardware/interp.h"
// Note some of the sprite routines are quite large (unrolled), so trying to

Wyświetl plik

@ -5,8 +5,11 @@
// Begin a global function called \name: place it in its own
// .time_critical.\name section (allocatable, executable), export the symbol,
// align it to 4 bytes and emit the entry label.
.macro decl_func name
.section .time_critical.\name, "ax"
.global \name
.p2align 2
// The symbol type and Thumb-function markers are only emitted for non-RISC-V
// (Arm) builds.
#ifndef __riscv
.type \name,%function
.thumb_func
#endif
\name:
.endm
@ -16,11 +19,40 @@
// same way as non-alpha pixels when encoding (and the co-opted channel LSB
// always ends up being set on alpha pixels, which is pretty harmless)
// Also note this is expressed as a right-shift into the carry flag (on Arm),
// so this is equal to the bit index of the alpha bit plus 1. On RISC-V it's
// idiomatic to shift up to the sign bit instead, so a left shift of 32 - x
// should be used instead of a right shift of x.
#define ALPHA_SHIFT_16BPP 6
// Assume RAGB2132 (so alpha is bit 5)
#define ALPHA_SHIFT_8BPP 6
#ifdef __riscv
// Macros for forcing individual instructions to be 32 bits, to maintain
// branch target alignment without adding NOPs
// Each macro assembles one instruction with compressed-instruction generation
// switched off, and restores the previous assembler options afterwards. The
// _1a/_2a/_3a suffix is the number of operands the wrapped instruction takes.
// Example: norvc_2a li a3, 16

// One-operand form: \instr \arg0
.macro norvc_1a instr, arg0
.option push
.option norvc
\instr \arg0
.option pop
.endm
// Two-operand form: \instr \arg0, \arg1
.macro norvc_2a instr, arg0, arg1
.option push
.option norvc
\instr \arg0, \arg1
.option pop
.endm
// Three-operand form: \instr \arg0, \arg1, \arg2
.macro norvc_3a instr, arg0, arg1, arg2
.option push
.option norvc
\instr \arg0, \arg1, \arg2
.option pop
.endm
#endif
#endif

Wyświetl plik

@ -0,0 +1,657 @@
// Functions for doing simple 2D graphics operations on a RGB scanline buffer.
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "sprite_asm_const.h"
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#if defined(__riscv_c) || defined(__riscv_zca)
#define RISCV_HAVE_COMPRESSED_ISA 1
#endif
// ----------------------------------------------------------------------------
// Colour fill
// Fill a run of 8-bit pixels with one value.
// a0: dst (any byte alignment)
// a1: value (only the low byte is used)
// a2: count (pixels, i.e. bytes)
// Clobbers a0-a3.
decl_func sprite_fill8
    // Slide for short fills: for count <= 18, branch into a run of 18 byte
    // stores so that exactly `count` of them execute.
    // The li is forced to 32 bits (as in sprite_fill16) so that the auipc
    // below starts on a 4-byte boundary and the slide begins exactly 12 bytes
    // after it. A compressed li would leave the auipc 2 bytes off, the .align
    // before the slide would insert padding, and every computed entry point
    // would land 2 bytes short, in the middle of a 32-bit sb.
    norvc_2a li a3, 18
    bltu a3, a2, 2f
#ifndef RISCV_HAVE_COMPRESSED_ISA
#error "This address computation is wrong for non-RVC:"
#endif
    // target = auipc address + 12 (distance to first sb) + 4 * (18 - count)
    auipc a3, 0         // 32-bit instruction after address of auipc
    slli a2, a2, 2      // 16-bit instruction after address of auipc
    sub a3, a3, a2      // 16-bit instruction after address of auipc
    jr a3, 18 * 4 + 12  // 32-bit instruction after address of auipc
    // Must not emit padding: the 12-byte distance above depends on it.
.align 2
    // With Zcb this is a mix of 16-bit and 32-bit instructions due to the
    // limited immediate size. Force 32-bit so we can do a computed branch.
.option push
.option norvc
    sb a1, 17(a0)
    sb a1, 16(a0)
    sb a1, 15(a0)
    sb a1, 14(a0)
    sb a1, 13(a0)
    sb a1, 12(a0)
    sb a1, 11(a0)
    sb a1, 10(a0)
    sb a1, 9(a0)
    sb a1, 8(a0)
    sb a1, 7(a0)
    sb a1, 6(a0)
    sb a1, 5(a0)
    sb a1, 4(a0)
    sb a1, 3(a0)
    sb a1, 2(a0)
    sb a1, 1(a0)
    sb a1, 0(a0)
.option pop
    ret
2:
    // Long fill (count >= 19). Duplicate byte x4
    packh a1, a1, a1
    pack a1, a1, a1
    // Get a0 word-aligned: if the address is odd, emit one byte...
    andi a3, a0, 0x1
    beqz a3, 1f
    sb a1, (a0)
    addi a0, a0, 1
    addi a2, a2, -1
1:
    // ...then if bit 1 is still set, emit one halfword.
    andi a3, a0, 0x2
    beqz a3, 1f
    sh a1, (a0)
    addi a0, a0, 2
    addi a2, a2, -2
1:
    // Set up for main loop. Limit pointer at end - (loop body size)
    add a2, a2, a0
    addi a2, a2, -16
    // Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
1:
    sw a1, 0(a0)
    sw a1, 4(a0)
    sw a1, 8(a0)
    sw a1, 12(a0)
    addi a0, a0, 16
    bgeu a2, a0, 1b
    // Main loop done, now tidy up the odds and ends. Note bits 3:0 of the
    // pointer difference are not affected by us subtracting 16 earlier.
    sub a2, a2, a0
    // No more than 15 bytes remaining -- first test bit 3 by shifting it to sign bit
    slli a2, a2, 28
    bgez a2, 1f
    sw a1, 0(a0)
    sw a1, 4(a0)
    addi a0, a0, 8
1:
    // Bit 2: one more word
    slli a2, a2, 1
    bgez a2, 1f
    sw a1, (a0)
    addi a0, a0, 4
1:
    // Bit 1: one more halfword
    slli a2, a2, 1
    bgez a2, 1f
    sh a1, (a0)
    addi a0, a0, 2
1:
    // Bit 0: one more byte
    slli a2, a2, 1
    bgez a2, 1f
    sb a1, (a0)
1:
    ret
.p2align 2
// Fill a run of 16-bit pixels with one value.
// a0: dst (halfword-aligned)
// a1: value (only the low halfword is used)
// a2: count (pixels)
// Clobbers a0-a3.
decl_func sprite_fill16
    // Slide for short fills: for count <= 16, branch into a run of 16
    // halfword stores so that exactly `count` of them execute. The li is
    // forced to 32 bits so the auipc is 4-byte aligned and the slide starts
    // exactly 12 bytes after it.
    norvc_2a li a3, 16
    bltu a3, a2, 2f
#ifndef RISCV_HAVE_COMPRESSED_ISA
#error "This address computation is wrong for non-RVC:"
#endif
    // target = auipc address + 12 (distance to first sh) + 4 * (16 - count)
    auipc a3, 0         // 32-bit instruction after address of auipc
    slli a2, a2, 2      // 16-bit instruction after address of auipc
    sub a3, a3, a2      // 16-bit instruction after address of auipc
    jr a3, 16 * 4 + 12  // 32-bit instruction after address of auipc
.option push
.option norvc
    sh a1, 30(a0)
    sh a1, 28(a0)
    sh a1, 26(a0)
    sh a1, 24(a0)
    sh a1, 22(a0)
    sh a1, 20(a0)
    sh a1, 18(a0)
    sh a1, 16(a0)
    sh a1, 14(a0)
    sh a1, 12(a0)
    sh a1, 10(a0)
    sh a1, 8(a0)
    sh a1, 6(a0)
    sh a1, 4(a0)
    sh a1, 2(a0)
    sh a1, 0(a0)
.option pop
    ret
2:
    // Long fill (count >= 17). Get word-aligned before main fill loop: the
    // test must be on the destination pointer (not the count), since it is
    // the pointer's bit 1 that decides whether the sw stores are aligned.
    andi a3, a0, 0x2
    beqz a3, 1f
    sh a1, (a0)
    addi a0, a0, 2
    addi a2, a2, -1
1:
    // Set limit pointer at end - (loop body size)
    slli a2, a2, 1
    add a2, a2, a0
    addi a2, a2, -32
    // Duplicate halfword x2
    pack a1, a1, a1
    // We can fall through because cases < 1 loop are handled by slide
1:
    sw a1, 0(a0)
    sw a1, 4(a0)
    sw a1, 8(a0)
    sw a1, 12(a0)
    sw a1, 16(a0)
    sw a1, 20(a0)
    sw a1, 24(a0)
    sw a1, 28(a0)
    addi a0, a0, 32
    bgeu a2, a0, 1b
    // Most of the work done, we have a few more to tidy up -- note bits 4:1
    // of the pointer difference are not affected by earlier subtraction of 32
    sub a2, a2, a0
    // Bit 4 becomes sign bit: 8 more pixels
    slli a2, a2, 27
    bgez a2, 1f
    sw a1, 0(a0)
    sw a1, 4(a0)
    sw a1, 8(a0)
    sw a1, 12(a0)
    addi a0, a0, 16
1:
    // Bit 3: 4 more pixels
    slli a2, a2, 1
    bgez a2, 1f
    sw a1, 0(a0)
    sw a1, 4(a0)
    addi a0, a0, 8
1:
    // Bit 2: 2 more pixels
    slli a2, a2, 1
    bgez a2, 1f
    sw a1, 0(a0)
    addi a0, a0, 4
1:
    // Bit 1: one last pixel
    slli a2, a2, 1
    bgez a2, 1f
    sh a1, 0(a0)
1:
    ret
// ----------------------------------------------------------------------------
// Non-AT sprite
// TODO 8-bit version not yet ported to RISC-V
#if 0
// Unrolled loop body with an initial computed branch.
// a0: dst
// a1: src
// a2: pixel count
decl_func sprite_blit8
mov ip, a0
lsrs a3, a2, #3
lsls a3, #3
eors a2, a3 // a2 = pixels % 8, a3 = pixels - pixels % 8
add a0, a3
add a1, a3
adr a3, 2f
lsls a2, #2
subs a3, a2
adds a3, #1 // thumb bit >:(
bx a3
.align 2
1:
subs a0, #8
subs a1, #8
ldrb a3, [a1, #7]
strb a3, [a0, #7]
ldrb a3, [a1, #6]
strb a3, [a0, #6]
ldrb a3, [a1, #5]
strb a3, [a0, #5]
ldrb a3, [a1, #4]
strb a3, [a0, #4]
ldrb a3, [a1, #3]
strb a3, [a0, #3]
ldrb a3, [a1, #2]
strb a3, [a0, #2]
ldrb a3, [a1, #1]
strb a3, [a0, #1]
ldrb a3, [a1, #0]
strb a3, [a0, #0]
2:
cmp a0, ip
bhi 1b
bx lr
.macro sprite_blit8_alpha_body n
ldrb a3, [a1, #\n]
lsrs a2, a3, #ALPHA_SHIFT_8BPP
bcc 2f
strb a3, [a0, #\n]
2:
.endm
// a0: dst
// a1: src
// a2: pixel count
decl_func sprite_blit8_alpha
mov ip, a0
lsrs a3, a2, #3
lsls a3, #3
eors a2, a3
add a0, a3
add a1, a3
adr a3, 3f
lsls a2, #3
subs a3, a2
adds a3, #1
bx a3
.align 2
1:
subs a0, #8
subs a1, #8
sprite_blit8_alpha_body 7
sprite_blit8_alpha_body 6
sprite_blit8_alpha_body 5
sprite_blit8_alpha_body 4
sprite_blit8_alpha_body 3
sprite_blit8_alpha_body 2
sprite_blit8_alpha_body 1
sprite_blit8_alpha_body 0
3:
cmp a0, ip
bhi 1b
bx lr
#endif
// Store the 32-bit value in \rd to \offs(\ra) as two halfword stores (low half
// first), so the destination only needs halfword alignment.
// Destroys \rd: it is shifted right by 16 to expose the upper half.
// Note this is the same ideal cycle count as lhu; lhu; sh; sh; but it reduces
// the number of memory accesses by 25%, so less bus contention
.macro storew_alignh rd ra offs
sh \rd, \offs(\ra)
srli \rd, \rd, 16
sh \rd, \offs+2(\ra)
.endm
// Copy a run of 16bpp pixels with no transparency test.
// a0: dst (written only with halfword stores)
// a1: src
// a2: pixel count
// Clobbers a0-a3 and a5.
decl_func sprite_blit16
// Force source pointer to be word-aligned
// NOTE(review): this fix-up copies one pixel without checking the count, so
// it appears to assume count >= 1 whenever src is not word-aligned -- confirm
// against callers.
andi a3, a1, 2
beqz a3, 1f
lhu a3, (a1)
sh a3, (a0)
addi a0, a0, 2
addi a1, a1, 2
addi a2, a2, -1
1:
// Each loop is 8 pixels. Place limit pointer at 16 bytes before
// end, loop until past it. There will be 0 to 7 pixels remaining.
slli a2, a2, 1
add a2, a2, a0
addi a5, a2, -16
// Early out:
bltu a5, a0, 2f
1:
// Word loads from the (now word-aligned) source, halfword-pair stores to dst.
// Loads are issued in pairs ahead of the stores that consume them.
lw a2, 0(a1)
lw a3, 4(a1)
storew_alignh a2, a0, 0
storew_alignh a3, a0, 4
lw a2, 8(a1)
lw a3, 12(a1)
storew_alignh a2, a0, 8
storew_alignh a3, a0, 12
addi a0, a0, 16
addi a1, a1, 16
bgeu a5, a0, 1b
2:
// a5 - a0 is (bytes remaining - 16); its low four bits equal the number of
// bytes remaining, which are tested one bit at a time via the sign bit.
sub a5, a5, a0
// At least 4 pixels? (bit 3 -> sign bit)
slli a5, a5, 28
bgez a5, 1f
lw a2, 0(a1)
lw a3, 4(a1)
storew_alignh a2, a0, 0
storew_alignh a3, a0, 4
addi a0, a0, 8
addi a1, a1, 8
1:
// At least 2 pixels?
slli a5, a5, 1
bgez a5, 1f
lw a2, 0(a1)
storew_alignh a2, a0, 0
addi a0, a0, 4
addi a1, a1, 4
1:
// One more pixel?
slli a5, a5, 1
bgez a5, 1f
lhu a3, (a1)
sh a3, (a0)
1:
ret
// Copy two adjacent 16bpp pixels (pair index \n, i.e. byte offsets 4*\n and
// 4*\n+2) from src to dst, skipping each pixel whose alpha bit is clear.
// dst: a0, src: a1, clobbers: a4-a7
.macro sprite_blit16_alpha_body_x2 n
// Disable RVC to force 32-bit alignment of branch targets without adding
// alignment nops (lhu/sh *may* be 16-bit if Zcb is enabled)
.option push
.option norvc
// Interleave two loads to avoid load->shift dependency stall
lhu a4, 4*\n(a1)
lhu a5, 4*\n+2(a1)
// Move each pixel's alpha bit into the sign bit so it can be tested with bgez
slli a6, a4, 32 - ALPHA_SHIFT_16BPP
slli a7, a5, 32 - ALPHA_SHIFT_16BPP
// Store the first pixel only if its alpha bit is set
bgez a6, 3f
sh a4, 4*\n(a0)
3:
// Store the second pixel only if its alpha bit is set
bgez a7, 3f
sh a5, 4*\n+2(a0)
3:
.option pop
.endm
// Copy a run of 16bpp pixels, skipping pixels whose alpha bit is clear.
// a0: dst
// a1: src
// a2: pixel count
// Clobbers a0-a2 and a4-a7.
decl_func sprite_blit16_alpha
// Not using the computed branch approach of the v6-M code as it doesn't
// play nicely with the pairing of pixels used in the loop body here.
// a2 becomes a limit pointer 16 bytes (one loop body) before the end of dst.
// The addi is forced to 32 bits; the macros in sprite_asm_const.h exist to
// keep branch targets aligned without inserting nops.
slli a2, a2, 1
add a2, a2, a0
norvc_3a addi, a2, a2, -16
// Skip the main loop if there are fewer than 8 pixels
bltu a2, a0, 2f
1:
// 8 pixels per loop
sprite_blit16_alpha_body_x2 0
sprite_blit16_alpha_body_x2 1
sprite_blit16_alpha_body_x2 2
sprite_blit16_alpha_body_x2 3
addi a0, a0, 16
addi a1, a1, 16
bgeu a2, a0, 1b
2:
// a2 - a0 is (bytes remaining - 16); its low four bits equal the number of
// bytes remaining, tested below one bit at a time via the sign bit.
sub a2, a2, a0
// At least 4 pixels? (bit 3 -> sign bit)
slli a2, a2, 28
bgez a2, 1f
sprite_blit16_alpha_body_x2 0
sprite_blit16_alpha_body_x2 1
addi a0, a0, 8
addi a1, a1, 8
1:
// At least 2 pixels?
norvc_3a slli, a2, a2, 1
bgez a2, 1f
sprite_blit16_alpha_body_x2 0
addi a1, a1, 4
addi a0, a0, 4
1:
// One more pixel?
slli a2, a2, 1
bgez a2, 1f
lhu a4, (a1)
// Store it only if its alpha bit is set
slli a6, a4, 32 - ALPHA_SHIFT_16BPP
bgez a6, 1f
sh a4, (a0)
1:
ret
// ----------------------------------------------------------------------------
// Affine-transformed sprite (note these are just the inner loops -- INTERP0
// must be configured by the caller, which is presumably not written in asm)
// TODO not yet ported to RISC-V
#if 0
// r0: raster start pointer
// r1: raster span size (pixels)
.macro sprite_ablit8_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrb r2, [r2]
strb r2, [r0, #\n]
2:
.endm
decl_func sprite_ablit8_loop
mov ip, r0
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
add r0, r2
adr r2, 3f
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
muls r1, r3
subs r2, r1
adds r2, #1
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
bx r2
.align 2
nop
1:
subs r0, #8
sprite_ablit8_loop_body 7
sprite_ablit8_loop_body 6
sprite_ablit8_loop_body 5
sprite_ablit8_loop_body 4
sprite_ablit8_loop_body 3
sprite_ablit8_loop_body 2
sprite_ablit8_loop_body 1
sprite_ablit8_loop_body 0
3:
cmp r0, ip
bne 1b
bx lr
// As above but bit 5 is assumed to be an alpha bit (RAGB2132)
.macro sprite_ablit8_alpha_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrb r2, [r2]
lsrs r1, r2, #ALPHA_SHIFT_8BPP
bcc 2f
strb r2, [r0, #\n]
2:
.endm
decl_func sprite_ablit8_alpha_loop
mov ip, r0
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
add r0, r2
adr r2, 3f
lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
subs r2, r1
adds r2, #1
bx r2
.align 2
nop
1:
subs r0, #8
sprite_ablit8_alpha_loop_body 7
sprite_ablit8_alpha_loop_body 6
sprite_ablit8_alpha_loop_body 5
sprite_ablit8_alpha_loop_body 4
sprite_ablit8_alpha_loop_body 3
sprite_ablit8_alpha_loop_body 2
sprite_ablit8_alpha_loop_body 1
sprite_ablit8_alpha_loop_body 0
3:
cmp r0, ip
bhi 1b
bx lr
.macro sprite_ablit16_loop_body n
ldr r1, [r3, #CTRL0_OFFS]
ldr r2, [r3, #POP2_OFFS]
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
bcs 2f
ldrh r2, [r2]
strh r2, [r0, #2*\n]
2:
.endm
decl_func sprite_ablit16_loop
mov ip, r0
lsrs r2, r1, #3
lsls r2, #3
eors r1, r2
lsls r2, #1 // Each pixel is 2 bytes
add r0, r2
adr r2, 3f
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
muls r1, r3
subs r2, r1
adds r2, #1
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
bx r2
.align 2
nop
1:
subs r0, #16
sprite_ablit16_loop_body 7
sprite_ablit16_loop_body 6
sprite_ablit16_loop_body 5
sprite_ablit16_loop_body 4
sprite_ablit16_loop_body 3
sprite_ablit16_loop_body 2
sprite_ablit16_loop_body 1
sprite_ablit16_loop_body 0
3:
cmp r0, ip
bne 1b
bx lr
#endif
#define FIX_OVERF_CHECK 1
#ifndef RISCV_HAVE_COMPRESSED_ISA
#error "Address calculations are incorrect if not assembled with C extension"
#endif
// One pixel of the affine-transformed 16bpp alpha blit: pop the next texel
// address from the interpolator at a5, and if the sample is in range and the
// texel's alpha bit is set, store the texel to dst pixel \n (byte offset
// 2*\n from a0). Clobbers a1-a3.
.macro sprite_ablit16_alpha_loop_body n
// Instructions which are only compressible under Zcb (e.g. lhu, sh) are
// forced uncompressed, to get consistent size for address calculations.
// The expansion must have a fixed size, because sprite_ablit16_alpha_loop
// branches to a computed offset inside a run of these bodies: 24 bytes when
// FIX_OVERF_CHECK is 0, and 30 bytes (the multiplier that function uses) when
// FIX_OVERF_CHECK is 1. Any edit here must keep that function in step.
#if !FIX_OVERF_CHECK
// Bit 25 is OVERF, bit 24 is OVERF1, bits 31:26 are zero, so can test for
// overflow by testing the uppermost byte of CTRL0 for nonzero.
norvc_2a lbu a1, CTRL0_OFFS+3(a5)
lw a2, POP2_OFFS(a5)
bnez a1, 2f
#else
// Read both accumulators before the pop, then skip this pixel if either one
// has any bit set at or above bit 23.
// NOTE(review): the bit position presumably follows from the fixed-point
// format and texture size limits the caller programs into INTERP0 -- confirm.
lw a1, ACCUM0_OFFS(a5)
lw a3, ACCUM1_OFFS(a5)
lw a2, POP2_OFFS(a5)
srli a1, a1, 7 + 16
bnez a1, 2f
srli a3, a3, 7 + 16
bnez a3, 2f
#endif
// a2 holds the popped texel address; fetch the texel
norvc_2a lhu a2, (a2)
// TODO dep stall on lhu, but it makes the OVERF case faster:
// Alpha bit -> sign bit; skip the store for transparent texels
slli a1, a2, 32 - ALPHA_SHIFT_16BPP
bgez a1, 2f
norvc_2a sh a2, 2*\n(a0)
2:
.endm
// Inner loop of the affine-transformed 16bpp alpha blit. INTERP0 must already
// be configured by the caller.
// a0: raster start pointer (dst)
// a1: raster span size (pixels)
// Pixels are produced in blocks of 8, walking dst from the last block back
// to the first; the first pass through the loop body is entered part-way to
// handle the (span size modulo 8) leftover pixels.
// Clobbers a0-a5.
decl_func sprite_ablit16_alpha_loop
// a4: copy of the start pointer, used as the loop termination bound
mv a4, a0
// a5: interpolator register base used by the loop body
li a5, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
// Split off pixels modulo 8
andi a2, a1, 0x7
sub a1, a1, a2
// Pointer to beginning of endmost block of 8 pixels:
sh1add a0, a1, a0
// Compute branch into first loop, which has the modulo-8 pixels.
// Each pixel takes a fixed number of bytes of instructions: 24 without
// FIX_OVERF_CHECK, 30 with it. a2 = (pixels modulo 8) * body size.
#if !FIX_OVERF_CHECK
slli a2, a2, 3
sh1add a2, a2, a2
#else
li a3, 30
mul a2, a2, a3
#endif
// Enter that many bytes before the end of the unrolled bodies
la a1, 3f
sub a1, a1, a2
jr a1
.align 2
1:
// Step back one block of 8 pixels (16 bytes)
norvc_3a addi a0, a0, -16
sprite_ablit16_alpha_loop_body 7
sprite_ablit16_alpha_loop_body 6
sprite_ablit16_alpha_loop_body 5
sprite_ablit16_alpha_loop_body 4
sprite_ablit16_alpha_loop_body 3
sprite_ablit16_alpha_loop_body 2
sprite_ablit16_alpha_loop_body 1
sprite_ablit16_alpha_loop_body 0
3:
// Keep going until the block pointer is back at the start of the span
bltu a4, a0, 1b
ret

Wyświetl plik

@ -1,6 +1,6 @@
#include "tile.h"
#include "pico/platform.h" // for __not_in_flash
#include "pico.h" // for __not_in_flash
#include "hardware/interp.h"
#define __ram_func(foo) __not_in_flash(#foo) foo

Wyświetl plik

@ -0,0 +1,188 @@
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "sprite_asm_const.h"
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// ----------------------------------------------------------------------------
// Tile layout
//
// Some terms:
// Tileset: 1D array of tile images, concatenated image-after-image
// Tilemap: 2D array of tileset indices
//
// Each tile image in a tileset is the same size. Tiles are square, either 8 x
// 8 px or 16 x 16 px. This makes it easy to find the start of a tile image
// given the tileset base pointer and a tile index (add + shift).
//
// Tilemaps are 8 bits per tile, always.
//
// One advantage of this layout is that y coordinates can be handled outside
// of the loops in this file, which are all scanline-oriented, by offsetting
// the tileset and tilemap pointers passed in. These routines only care about
// x. The tileset pointer is offset by y modulo tile height, and the tilemap
// pointer is offset by y divided by tile height, modulo tileset height in
// tiles.
// Tileset: 16px tiles, 16bpp, with 1-bit alpha.
// Tilemap: 8 bit indices.

// Write the two 16bpp pixels packed in word \rs to \dstoffs(\rd) and
// \dstoffs+2(\rd), skipping each pixel whose alpha bit is clear.
// \rx is a scratch register. \rs is destroyed when the second pixel is
// written (it is shifted right to expose the upper halfword).
.macro do_2px_16bpp_alpha rd rs rx dstoffs
.option push
.option norvc
// TODO we could save a shift here by making alpha the MSB (not worth it
// on Arm due to lack of sign-extension or flag update on loads)
// Low pixel: move its alpha bit to the sign bit and test
slli \rx, \rs, 32 - ALPHA_SHIFT_16BPP
bgez \rx, 1f
sh \rs, \dstoffs(\rd)
1:
// High pixel: its alpha bit sits 16 bits higher, so shift 16 places less
slli \rx, \rs, 16 - ALPHA_SHIFT_16BPP
bgez \rx, 1f
srli \rs, \rs, 16
sh \rs, \dstoffs+2(\rd)
1:
.option pop
.endm
// Write the two 16bpp pixels packed in word \rs to \dstoffs(\rd) and
// \dstoffs+2(\rd) unconditionally, as two halfword stores (low half first).
// Destroys \rs (shifted right by 16).
.macro do_2px_16bpp rd rs dstoffs
sh \rs, \dstoffs(\rd)
srli \rs, \rs, 16
sh \rs, \dstoffs+2(\rd)
.endm
// Render one scanline span of a 16px, 16bpp tile layer. The span is handled
// in three phases: loose pixels up to the first tile boundary, whole tiles,
// then loose pixels after the last whole tile.
// interp1 has been set up to give the next x-ward pointer into the tilemap
// with each pop. This saves us having to remember the tilemap pointer and
// tilemap x size mask in core registers.
// a0: dst
// a1: tileset
// a2: x0 (start pos in tile space)
// a3: x1 (end pos in tile space, exclusive)
// Clobbers a0-a7, t0 and t1. The macro ends in ret, so one expansion is a
// complete function body.
// Instantiated with alpha=1 and alpha=0 to get both variants of the loop.
// Linker garbage collection ensures we only keep the versions we use.
.macro tile16_16px_loop_alpha_or_nonalpha alpha
// a7: interp1 register base, used for every tilemap pop
li a7, SIO_BASE + SIO_INTERP1_ACCUM0_OFFSET
// The main loop only handles whole tiles, so we may need to first copy
// individual pixels to get tile-aligned. Skip this entirely if we are
// already aligned, to avoid the extra interp pop.
andi a5, a2, 0xf
beqz a5, 3f
// Get pointer to tileset image
lw a4, POP2_OFFS(a7)
lbu a4, (a4) // dep stall
slli a4, a4, 9 // 16 px wide * 16 px high * 2 bytes/px
add a4, a4, a1
// Offset tile image pointer to align with x0
sh1add a4, a5, a4
// Fall through into copy loop
1:
lhu a5, (a4)
addi a4, a4, 2 // hoisted to fill load dependency slot
.if \alpha
// Alpha bit -> sign bit; skip the store for transparent pixels
slli a6, a5, 32 - ALPHA_SHIFT_16BPP
bgez a6, 2f
.endif
sh a5, (a0)
2:
addi a0, a0, 2
addi a2, a2, 1
// Skip out if we have already reached end of span:
bgeu a2, a3, 3f
// Loop if we are not yet aligned: (TODO these checks could be merged)
andi a6, a2, 0xf
bnez a6, 1b
3:
// The next output pixel is aligned to the start of a tile. Set up main loop.
// Tileset pointer is only needed occasionally, so free up a1 for better
// code density:
mv t0, a1
// t1: dst limit pointer at end of all pixels:
sub a3, a3, a2
sh1add t1, a3, a0
// a5: dst limit pointer at end of whole tiles:
andi a4, a3, ~0xf
sh1add a5, a4, a0
// a0 is dst, a7 is interp base, a1-a4 are trashed by loop, a5 is dst limit.
// Early skip for case of 0 whole tiles:
bgeu a0, a5, 3f
2:
// Get next tilemap pointer
lw a1, POP2_OFFS(a7)
// Get tile image pointer
lbu a1, (a1) // dep stall
slli a1, a1, 9
add a1, a1, t0
// Copy one 16-pixel tile row (32 bytes) as eight words, two words at a time
// so each pair of loads is issued ahead of the stores that consume it.
.if \alpha
lw a3, 0(a1)
lw a4, 4(a1)
do_2px_16bpp_alpha a0 a3 a2 0
do_2px_16bpp_alpha a0 a4 a2 4
lw a3, 8(a1)
lw a4, 12(a1)
do_2px_16bpp_alpha a0 a3 a2 8
do_2px_16bpp_alpha a0 a4 a2 12
lw a3, 16(a1)
lw a4, 20(a1)
do_2px_16bpp_alpha a0 a3 a2 16
do_2px_16bpp_alpha a0 a4 a2 20
lw a3, 24(a1)
lw a4, 28(a1)
do_2px_16bpp_alpha a0 a3 a2 24
do_2px_16bpp_alpha a0 a4 a2 28
.else
lw a3, 0(a1)
lw a4, 4(a1)
do_2px_16bpp a0 a3 0
do_2px_16bpp a0 a4 4
lw a3, 8(a1)
lw a4, 12(a1)
do_2px_16bpp a0 a3 8
do_2px_16bpp a0 a4 12
lw a3, 16(a1)
lw a4, 20(a1)
do_2px_16bpp a0 a3 16
do_2px_16bpp a0 a4 20
lw a3, 24(a1)
lw a4, 28(a1)
do_2px_16bpp a0 a3 24
do_2px_16bpp a0 a4 28
.endif
addi a0, a0, 32
bltu a0, a5, 2b
3:
// Skip ahead if there are no spare pixels to tidy up
bgeu a0, t1, 3f
// Copy <1 tile's worth of loose pixels
lw a4, POP2_OFFS(a7)
lbu a4, (a4) // dep stall
slli a4, a4, 9
add a4, a4, t0
1:
// Sign-extending load here (the leading loop uses lhu); only the low 16 bits
// are tested and stored, so the result is the same.
lh a5, (a4)
addi a4, a4, 2
.if \alpha
slli a6, a5, 32 - ALPHA_SHIFT_16BPP
bgez a6, 2f
.endif
sh a5, (a0)
2:
addi a0, a0, 2
bltu a0, t1, 1b
3:
ret
.endm
decl_func tile16_16px_alpha_loop
tile16_16px_loop_alpha_or_nonalpha 1
decl_func tile16_16px_loop
tile16_16px_loop_alpha_or_nonalpha 0