kopia lustrzana https://github.com/Wren6991/PicoDVI
RP2350 changes (including RISC-V)
rodzic
8ecccce7b7
commit
ca941baf37
27
Readme.md
27
Readme.md
|
@ -1,3 +1,30 @@
|
|||
RP2350 PicoDVI Preview
|
||||
======================
|
||||
|
||||
Changes from the public GitHub version:
|
||||
|
||||
* All Arm assembly in `libdvi` has been ported to RISC-V and tuned for Hazard3
|
||||
* Some of the existing Arm assembly in `libdvi` has been tweaked for better performance on Cortex-M33
|
||||
* RGB encode now uses the SIO TMDS encoders by default on RP2350 (can be disabled by defining `DVI_USE_SIO_TMDS_ENCODER=0` -- see `software/libdvi/dvi_config_defs.h`)
|
||||
* Much of the Arm assembly in `libsprite` has been ported to RISC-V -- enough to run the stock demos
|
||||
|
||||
Build instructions:
|
||||
|
||||
```bash
|
||||
cd software
|
||||
mkdir build && cd build
|
||||
# PICO_PLATFORM can also be rp2350-riscv
|
||||
# List of DVI configs is in software/include/common_dvi_pin_configs.h
|
||||
cmake -DPICO_SDK_PATH=/path/to/sdk -DPICO_PLATFORM=rp2350 -DPICO_COPY_TO_RAM=1 -DDVI_DEFAULT_SERIAL_CONFIG=pico_sock_cfg ..
|
||||
make -j$(nproc)
|
||||
# Then flash a binary, e.g.:
|
||||
cp apps/tiles_and_sprites/tiles_and_sprites.uf2 /media/${USER}/RP2350
|
||||
```
|
||||
|
||||
If you plan to run the `vista` demo, then note that there are now two UF2 data files, `software/assets/vista_data_rp2040.uf2` and `software/assets/vista_data_rp2350.uf2`. The only difference is the family IDs: the first can be dragged on RP2040 and on RP2350 A0, and the second can be dragged on RP2350 A1 and later.
|
||||
|
||||
The following is the original RP2040 writeup:
|
||||
|
||||
Bitbanged DVI on the RP2040 Microcontroller
|
||||
===========================================
|
||||
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
build
|
||||
*.swp
|
||||
build-*
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
add_subdirectory(bad_apple)
|
||||
if (NOT PICO_RISCV)
|
||||
# Arm assembly needs porting to RISC-V
|
||||
add_subdirectory(bad_apple)
|
||||
endif()
|
||||
add_subdirectory(colour_terminal)
|
||||
add_subdirectory(christmas_snowflakes)
|
||||
add_subdirectory(dht_logging)
|
||||
|
@ -12,5 +15,8 @@ add_subdirectory(tiles)
|
|||
add_subdirectory(tiles_and_sprites)
|
||||
add_subdirectory(tiles_parallax)
|
||||
add_subdirectory(vista)
|
||||
add_subdirectory(vista-palette)
|
||||
if (PICO_RP2040)
|
||||
# Needs porting to use XIP stream instead of SSI, as was done to vista
|
||||
add_subdirectory(vista-palette)
|
||||
endif()
|
||||
add_subdirectory(mandel-full)
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include "hardware/gpio.h"
|
||||
#include "hardware/vreg.h"
|
||||
#include "hardware/structs/bus_ctrl.h"
|
||||
#include "hardware/structs/ssi.h"
|
||||
#include "hardware/dma.h"
|
||||
#include "pico/sem.h"
|
||||
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#include "hardware/regs/addressmap.h"
|
||||
#include "hardware/regs/sio.h"
|
||||
|
||||
#ifndef __riscv
|
||||
.syntax unified
|
||||
.cpu cortex-m0plus
|
||||
.thumb
|
||||
#endif
|
||||
|
||||
// Using the following:
|
||||
//
|
||||
|
@ -46,12 +48,13 @@
|
|||
// r8 contains a pointer to the font bitmap for this scanline.
|
||||
// r9 contains the TMDS LUT base.
|
||||
.macro do_char charbuf_offs colour_shift_instr colour_shamt
|
||||
#ifndef __riscv
|
||||
// Get 8x font bits for next character, put 4 LSBs in bits 6:3 of r4 (so
|
||||
// scaled to 8-byte LUT entries), and 4 MSBs in bits 6:3 of r6.
|
||||
ldrb r4, [r0, #\charbuf_offs] // 2
|
||||
add r4, r8 // 1
|
||||
ldrb r4, [r4] // 2
|
||||
lsrs r6, r4, #4 // 1
|
||||
ldrb r4, [r0, #\charbuf_offs] // 2 (note these cycle
|
||||
add r4, r8 // 1 counts are for M0+
|
||||
ldrb r4, [r4] // 2 and are a little
|
||||
lsrs r6, r4, #4 // 1 pessimistic on M33)
|
||||
lsls r6, #3 // 1
|
||||
lsls r4, #28 // 1
|
||||
lsrs r4, #25 // 1
|
||||
|
@ -67,6 +70,31 @@
|
|||
ldmia r4, {r4, r5} // 3
|
||||
ldmia r6, {r6, r7} // 3
|
||||
stmia r2!, {r4-r7} // 5
|
||||
#else
|
||||
lbu a4, \charbuf_offs(a0) // 1
|
||||
\colour_shift_instr a5, a1, \colour_shamt // 1
|
||||
add a4, a4, t1 // 1
|
||||
lbu a4, (a4) // 2
|
||||
srli a6, a4, 4 // 1
|
||||
andi a4, a4, 0xf // 1
|
||||
|
||||
// Get colour bits, add to TMDS LUT base and font bits
|
||||
and a5, a5, a3 // 1
|
||||
add a5, a5, t2 // 1
|
||||
sh3add a4, a4, a5 // 1
|
||||
sh3add a6, a6, a5 // 1
|
||||
|
||||
// Look up and write out 8 TMDS symbols
|
||||
lw a5, 4(a4) // 1
|
||||
lw a4, 0(a4) // 1
|
||||
lw a7, 4(a6) // 1
|
||||
lw a6, 0(a6) // 1
|
||||
sw a4, 0(a2) // 1
|
||||
sw a5, 4(a2) // 1
|
||||
sw a6, 8(a2) // 1
|
||||
sw a7, 12(a2) // 1
|
||||
addi a2, a2, 16 // 1
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
|
@ -78,9 +106,12 @@
|
|||
|
||||
.section .scratch_x.tmds_encode_font_2bpp, "ax"
|
||||
.global tmds_encode_font_2bpp
|
||||
#ifndef __riscv
|
||||
.type tmds_encode_font_2bpp,%function
|
||||
.thumb_func
|
||||
#endif
|
||||
tmds_encode_font_2bpp:
|
||||
#ifndef __riscv
|
||||
push {r4-r7, lr}
|
||||
mov r4, r8
|
||||
mov r5, r9
|
||||
|
@ -123,6 +154,32 @@ tmds_encode_font_2bpp:
|
|||
mov r10, r6
|
||||
pop {r4-r7, pc}
|
||||
|
||||
#else
|
||||
|
||||
sh1add t0, a3, a2
|
||||
li a3, 0xf0 * 8
|
||||
|
||||
mv t1, a4
|
||||
la t2, palettised_1bpp_tables
|
||||
mv t3, a1
|
||||
|
||||
bgeu a2, t0, 2f
|
||||
1:
|
||||
lw a1, (t3)
|
||||
addi t3, t3, 4
|
||||
do_char 0 slli 7
|
||||
do_char 1 slli 3
|
||||
do_char 2 srli 1
|
||||
do_char 3 srli 5
|
||||
do_char 4 srli 9
|
||||
do_char 5 srli 13
|
||||
do_char 6 srli 17
|
||||
do_char 7 srli 21
|
||||
addi a0, a0, 8
|
||||
bltu a2, t0, 1b
|
||||
2:
|
||||
ret
|
||||
#endif
|
||||
|
||||
// Table generation:
|
||||
// levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include "hardware/pll.h"
|
||||
#include "hardware/sync.h"
|
||||
#include "hardware/structs/bus_ctrl.h"
|
||||
#include "hardware/structs/ssi.h"
|
||||
#include "hardware/vreg.h"
|
||||
#include "pico/multicore.h"
|
||||
#include "pico/sem.h"
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include "hardware/gpio.h"
|
||||
#include "hardware/vreg.h"
|
||||
#include "hardware/structs/bus_ctrl.h"
|
||||
#include "hardware/structs/ssi.h"
|
||||
#include "hardware/dma.h"
|
||||
#include "pico/sem.h"
|
||||
|
||||
|
|
|
@ -127,8 +127,10 @@ void __not_in_flash("render") render_loop() {
|
|||
tile16(pixbuf, &bg1, y, FRAME_WIDTH);
|
||||
queue_add_blocking(&dvi0.q_colour_valid, &pixbuf);
|
||||
}
|
||||
bg0.xscroll += 1;
|
||||
bg1.xscroll += 2;
|
||||
bg1.xscroll += 1;
|
||||
if (frame_ctr & 1) {
|
||||
bg0.xscroll += 1;
|
||||
}
|
||||
++frame_ctr;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,9 +10,19 @@ add_executable(vista-palette
|
|||
# flash using direct SSI DMA, which would trample on XIP.
|
||||
pico_set_binary_type(vista-palette copy_to_ram)
|
||||
|
||||
pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2_common/boot_stage2/boot2_w25q080.S)
|
||||
target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
|
||||
pico_set_boot_stage2(vista-palette vista-palette_boot2)
|
||||
|
||||
if (PICO_RP2040)
|
||||
pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
|
||||
pico_set_boot_stage2(vista-palette vista-palette_boot2)
|
||||
target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
|
||||
else ()
|
||||
target_compile_definitions(vista-palette PRIVATE
|
||||
PICO_EMBED_XIP_SETUP=1
|
||||
PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
|
||||
PICO_FLASH_SPI_CLKDIV=2
|
||||
PICO_FLASH_SPI_RXDELAY=3
|
||||
)
|
||||
endif()
|
||||
|
||||
target_compile_definitions(vista-palette PRIVATE
|
||||
DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG}
|
||||
|
|
|
@ -13,6 +13,19 @@ target_compile_definitions(vista PRIVATE
|
|||
DVI_SYMBOLS_PER_WORD=1
|
||||
)
|
||||
|
||||
if (PICO_RP2040)
|
||||
pico_define_boot_stage2(vista_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
|
||||
pico_set_boot_stage2(vista vista_boot2)
|
||||
target_compile_definitions(vista_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
|
||||
else ()
|
||||
target_compile_definitions(vista PRIVATE
|
||||
PICO_EMBED_XIP_SETUP=1
|
||||
PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
|
||||
PICO_FLASH_SPI_CLKDIV=2
|
||||
PICO_FLASH_SPI_RXDELAY=3
|
||||
)
|
||||
endif()
|
||||
|
||||
target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200)
|
||||
|
||||
target_link_libraries(vista
|
||||
|
|
|
@ -7,11 +7,17 @@
|
|||
#include "hardware/pll.h"
|
||||
#include "hardware/sync.h"
|
||||
#include "hardware/structs/bus_ctrl.h"
|
||||
#include "hardware/structs/ssi.h"
|
||||
#include "hardware/vreg.h"
|
||||
#include "pico/multicore.h"
|
||||
#include "pico/sem.h"
|
||||
#include "pico/stdlib.h"
|
||||
#if PICO_RP2040
|
||||
#include "hardware/structs/ssi.h"
|
||||
#else
|
||||
#include "hardware/structs/xip_ctrl.h"
|
||||
#include "hardware/structs/xip_aux.h"
|
||||
#include "hardware/structs/qmi.h"
|
||||
#endif
|
||||
|
||||
#include "tmds_encode.h"
|
||||
|
||||
|
@ -45,27 +51,40 @@ static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf
|
|||
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11);
|
||||
}
|
||||
|
||||
void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan)
|
||||
{
|
||||
void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan) {
|
||||
#if PICO_RP2040
|
||||
// On RP2040, program the SSI to clock the correct amount of data without stopping
|
||||
ssi_hw->ssienr = 0;
|
||||
ssi_hw->ctrlr1 = len - 1; // NDF, number of data frames
|
||||
ssi_hw->dmacr = SSI_DMACR_TDMAE_BITS | SSI_DMACR_RDMAE_BITS;
|
||||
ssi_hw->ssienr = 1;
|
||||
// Other than NDF, the SSI configuration used for XIP is suitable for a bulk read too.
|
||||
|
||||
dma_hw->ch[dma_chan].read_addr = (uint32_t)&ssi_hw->dr0;
|
||||
const uintptr_t read_addr = (uintptr_t)&ssi_hw->dr0;
|
||||
const uint dreq = DREQ_XIP_SSIRX;
|
||||
const bool bswap = true;
|
||||
#else
|
||||
// On RP2350, SSI is gone, but XIP streaming is fast enough to keep up with this demo
|
||||
// (you can still DMA to the DIRECT_MODE FIFOs if you really need 100%)
|
||||
xip_ctrl_hw->stream_addr = flash_offs;
|
||||
xip_ctrl_hw->stream_ctr = len;
|
||||
const uintptr_t read_addr = (uintptr_t)&xip_aux_hw->stream;
|
||||
const uint dreq = DREQ_XIP_STREAM;
|
||||
const bool bswap = false;
|
||||
#endif
|
||||
dma_hw->ch[dma_chan].read_addr = read_addr;
|
||||
dma_hw->ch[dma_chan].write_addr = (uint32_t)rxbuf;
|
||||
dma_hw->ch[dma_chan].transfer_count = len;
|
||||
dma_hw->ch[dma_chan].ctrl_trig =
|
||||
DMA_CH0_CTRL_TRIG_BSWAP_BITS |
|
||||
DREQ_XIP_SSIRX << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
|
||||
(uint)bswap << DMA_CH0_CTRL_TRIG_BSWAP_LSB |
|
||||
dreq << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
|
||||
dma_chan << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB |
|
||||
DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS |
|
||||
DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB |
|
||||
DMA_CH0_CTRL_TRIG_EN_BITS;
|
||||
|
||||
#if PICO_RP2040
|
||||
// Now DMA is waiting, kick off the SSI transfer (mode continuation bits in LSBs)
|
||||
ssi_hw->dr0 = (flash_offs << 8) | 0xa0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Core 1 handles DMA IRQs and runs TMDS encode on scanline buffers it
|
||||
|
@ -91,6 +110,15 @@ int __not_in_flash("main") main() {
|
|||
sleep_ms(10);
|
||||
set_sys_clock_khz(DVI_TIMING.bit_clk_khz, true);
|
||||
|
||||
// A0 SDK won't pick up on the PICO_EMBED_XIP_SETUP flag, so just to make sure:
|
||||
#if PICO_RP2350
|
||||
hw_write_masked(
|
||||
&qmi_hw->m[0].timing,
|
||||
3 << QMI_M0_TIMING_RXDELAY_LSB | 2 << QMI_M0_TIMING_CLKDIV_LSB,
|
||||
QMI_M0_TIMING_RXDELAY_BITS | QMI_M0_TIMING_CLKDIV_BITS
|
||||
);
|
||||
#endif
|
||||
|
||||
setup_default_uart();
|
||||
|
||||
gpio_init(LED_PIN);
|
||||
|
@ -131,21 +159,25 @@ int __not_in_flash("main") main() {
|
|||
}
|
||||
for (int y = 0; y < 2 * FRAME_HEIGHT; y += 2) {
|
||||
// Start DMA to back buffer before starting to encode the front buffer (each buffer is two scanlines)
|
||||
#if !PICO_RP2040
|
||||
// On RP2040 we could never reach this point early, because of the slow encode!
|
||||
dma_channel_wait_for_finish_blocking(img_dma_chan);
|
||||
#endif
|
||||
flash_bulk_dma_start(
|
||||
(uint32_t*)img_buf[img_buf_back],
|
||||
current_image_base + ((y + 2) % (2 * FRAME_HEIGHT)) * IMAGE_SCANLINE_SIZE,
|
||||
IMAGE_SCANLINE_SIZE * 2 / sizeof(uint32_t),
|
||||
img_dma_chan
|
||||
);
|
||||
const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];
|
||||
const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];
|
||||
uint32_t *our_tmds_buf, *their_tmds_buf;
|
||||
queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf);
|
||||
multicore_fifo_push_blocking((uint32_t)(img));
|
||||
multicore_fifo_push_blocking((uint32_t)their_tmds_buf);
|
||||
|
||||
|
||||
queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf);
|
||||
prepare_scanline((const uint32_t*)(img + FRAME_WIDTH * 2), our_tmds_buf);
|
||||
|
||||
|
||||
multicore_fifo_pop_blocking();
|
||||
queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf);
|
||||
queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf);
|
||||
|
@ -156,4 +188,3 @@ int __not_in_flash("main") main() {
|
|||
}
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
|
|
Plik binarny nie jest wyświetlany.
|
@ -28,6 +28,17 @@ static const struct dvi_serialiser_cfg picodvi_reva_dvi_cfg = {
|
|||
.invert_diffpairs = true
|
||||
};
|
||||
|
||||
// AMY-DVI board, for getting HDMI from the RP2350 FPGA development platform,
|
||||
// again a cursed board that only a couple of people in the world possess:
|
||||
static const struct dvi_serialiser_cfg amy_dvi_cfg = {
|
||||
.pio = DVI_DEFAULT_PIO_INST,
|
||||
.sm_tmds = {0, 1, 2},
|
||||
.pins_tmds = {14, 16, 18},
|
||||
.pins_clk = 12,
|
||||
.invert_diffpairs = true
|
||||
};
|
||||
|
||||
|
||||
// The not-HDMI socket on Rev C PicoDVI boards
|
||||
// (we don't talk about Rev B)
|
||||
static const struct dvi_serialiser_cfg picodvi_dvi_cfg = {
|
||||
|
|
|
@ -184,7 +184,7 @@ static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
|
|||
// Make sure all three channels have definitely loaded their last block
|
||||
// (should be within a few cycles of one another)
|
||||
for (int i = 0; i < N_TMDS_LANES; ++i) {
|
||||
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
|
||||
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].dbg_tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
|
||||
tight_loop_contents();
|
||||
}
|
||||
|
||||
|
|
|
@ -51,8 +51,16 @@
|
|||
#define DVI_SYMBOLS_PER_WORD 2
|
||||
#endif
|
||||
|
||||
#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
|
||||
#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
|
||||
// Implement TMDS encode with hardware encoders in SIO, instead of
|
||||
// interpolators + LUTs. The processor still has to crank the encoder, but
|
||||
// it's much faster. This still works with PIO serialisers, which can appear
|
||||
// on any GPIO, unlike the HSTX which is limited to specific GPIOs.
|
||||
#ifndef DVI_USE_SIO_TMDS_ENCODER
|
||||
#if PICO_RP2040
|
||||
#define DVI_USE_SIO_TMDS_ENCODER 0
|
||||
#else
|
||||
#define DVI_USE_SIO_TMDS_ENCODER 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
|
|
@ -2,6 +2,10 @@
|
|||
#include "hardware/regs/sio.h"
|
||||
#include "dvi_config_defs.h"
|
||||
|
||||
// This file contains both Arm and RISC-V source, with the correct version
|
||||
// selected via the __arm__ and __riscv predefined macros. The targeted Arm
|
||||
// dialect is Armv6-M, and the targeted RISC-V dialect is RV32IZba
|
||||
|
||||
// Offsets suitable for ldr/str (must be <= 0x7c):
|
||||
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
|
@ -13,23 +17,33 @@
|
|||
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
|
||||
// word-addressed space... almost as though it were intentional! :)
|
||||
|
||||
#if defined(__arm__) && defined(__riscv)
|
||||
#error "wat"
|
||||
#endif
|
||||
|
||||
#ifdef __arm__
|
||||
.syntax unified
|
||||
.cpu cortex-m0plus
|
||||
.thumb
|
||||
#endif
|
||||
|
||||
// decl_func_x name
// Opens a dedicated section named .scratch_x.<name> (flags "ax"), exports
// <name> as a global symbol and places the label <name> at the current
// position. The .type/.thumb_func directives are only emitted when __arm__ is
// defined; other builds get just the section, the .global and the label.
.macro decl_func_x name
|
||||
.section .scratch_x.\name, "ax"
|
||||
.global \name
|
||||
#ifdef __arm__
|
||||
.type \name,%function
|
||||
.thumb_func
|
||||
#endif
|
||||
\name:
|
||||
.endm
|
||||
|
||||
// decl_func_y name
// Same as decl_func_x, except the symbol is placed in a section named
// .scratch_y.<name>. The .type/.thumb_func directives are only emitted when
// __arm__ is defined.
.macro decl_func_y name
|
||||
.section .scratch_y.\name, "ax"
|
||||
.global \name
|
||||
#ifdef __arm__
|
||||
.type \name,%function
|
||||
.thumb_func
|
||||
#endif
|
||||
\name:
|
||||
.endm
|
||||
|
||||
|
@ -41,7 +55,10 @@
|
|||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Input size (pixels)
|
||||
// r3: Left shift (for the *_leftshift variant only -- costs 1 cycle per 2 pixels)
|
||||
|
||||
#if defined(__arm__)
|
||||
// Armv6-M:
|
||||
.macro do_channel_16bpp r_ibase r_inout0 r_out1
|
||||
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
|
||||
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
|
||||
|
@ -50,8 +67,11 @@
|
|||
ldr \r_out1, [\r_out1]
|
||||
.endm
|
||||
|
||||
decl_func tmds_encode_loop_16bpp
|
||||
.macro tmds_encode_loop_16bpp_impl leftshift
|
||||
push {r4, r5, r6, r7, lr}
|
||||
// Bounds calculation: each input pixel results in two output pixels,
|
||||
// whose two TMDS symbols are packed in a single 32-bit word. So, 4 bytes
|
||||
// out per one pixel in.
|
||||
lsls r2, #2
|
||||
add r2, r1
|
||||
mov ip, r2
|
||||
|
@ -61,7 +81,13 @@ decl_func tmds_encode_loop_16bpp
|
|||
1:
|
||||
.rept TMDS_ENCODE_UNROLL
|
||||
ldmia r0!, {r4, r6}
|
||||
.if \leftshift
|
||||
lsls r4, r3
|
||||
.endif
|
||||
do_channel_16bpp r2, r4, r5
|
||||
.if \leftshift
|
||||
lsls r6, r3
|
||||
.endif
|
||||
do_channel_16bpp r2, r6, r7
|
||||
stmia r1!, {r4, r5, r6, r7}
|
||||
.endr
|
||||
|
@ -69,82 +95,72 @@ decl_func tmds_encode_loop_16bpp
|
|||
cmp r1, ip
|
||||
bne 1b
|
||||
pop {r4, r5, r6, r7, pc}
|
||||
.endm
|
||||
|
||||
// Same as above, but scale data to make up for lack of left shift
|
||||
// in interpolator (costs 1 cycle per 2 pixels)
|
||||
//
|
||||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Input size (pixels)
|
||||
// r3: Left shift amount
|
||||
#elif defined(__riscv)
|
||||
// RISC-V variant of do_channel_16bpp.
//   r_ibase:  register holding the base address used for the ACCUM0/PEEK0/PEEK1
//             offsets
//   r_inout0: in  - value stored to the ACCUM0 slot
//             out - word loaded from the address read back from the PEEK0 slot
//   r_out1:   out - word loaded from the address read back from the PEEK1 slot
// Both PEEK values are used as pointers: each is read, then dereferenced.
.macro do_channel_16bpp r_ibase r_inout0 r_out1
|
||||
sw \r_inout0, ACCUM0_OFFS(\r_ibase)
|
||||
// Note two halves are interleaved to avoid load->addr dependency
|
||||
// (i.e. both pointer reads are issued before either pointer is dereferenced)
lw \r_inout0, PEEK0_OFFS(\r_ibase)
|
||||
lw \r_out1, PEEK1_OFFS(\r_ibase)
|
||||
lw \r_inout0, (\r_inout0)
|
||||
lw \r_out1, (\r_out1)
|
||||
.endm
|
||||
|
||||
// RISC-V loop body for the 16bpp encode functions. \leftshift selects at
// assembly time whether each input word is shifted left by a3 before encoding.
//   a0: input pointer   (advanced by 8 bytes per unrolled step)
//   a1: output pointer  (advanced by 16 bytes per unrolled step)
//   a2: input size in pixels on entry; overwritten with the interpolator base
//   a3: left shift amount (only read when \leftshift is nonzero)
// Overwrites a4-a7 and t0. Ends with ret, so it forms the whole function body.
.macro tmds_encode_loop_16bpp_impl leftshift
|
||||
// t0 = a1 + 4 * a2: end-of-output pointer (4 output bytes per input pixel)
slli a2, a2, 2
|
||||
add t0, a2, a1
|
||||
// Empty output range: skip the loop and return immediately
bgeu a1, t0, 2f
|
||||
// a2 is free now that the limit lives in t0; reuse it as the base register
// for the ACCUM/PEEK accesses made by do_channel_16bpp
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
|
||||
.align 2
|
||||
1:
|
||||
// i is an assembly-time counter giving each unrolled copy its own load/store
// offsets, so the pointers only need updating once per pass
.set i, 0
|
||||
.rept TMDS_ENCODE_UNROLL
|
||||
lw a4, 8 * i + 0(a0)
|
||||
lw a6, 8 * i + 4(a0)
|
||||
.if \leftshift
|
||||
sll a4, a4, a3
|
||||
sll a6, a6, a3
|
||||
.endif
|
||||
do_channel_16bpp a2, a4, a5
|
||||
do_channel_16bpp a2, a6, a7
|
||||
sw a4, 16 * i + 0(a1)
|
||||
sw a5, 16 * i + 4(a1)
|
||||
sw a6, 16 * i + 8(a1)
|
||||
sw a7, 16 * i + 12(a1)
|
||||
.set i, i + 1
|
||||
.endr
|
||||
// Step both pointers past everything the unrolled body just handled
addi a0, a0, 8 * TMDS_ENCODE_UNROLL
|
||||
addi a1, a1, 16 * TMDS_ENCODE_UNROLL
|
||||
// The limit is only tested once per unrolled pass, so a full pass always runs
// even if fewer bytes than that remain before t0
bltu a1, t0, 1b
|
||||
2:
|
||||
ret
|
||||
.endm
|
||||
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
decl_func tmds_encode_loop_16bpp
|
||||
tmds_encode_loop_16bpp_impl 0
|
||||
|
||||
decl_func tmds_encode_loop_16bpp_leftshift
|
||||
push {r4, r5, r6, r7, lr}
|
||||
lsls r2, #2
|
||||
add r2, r1
|
||||
mov ip, r2
|
||||
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
b 2f
|
||||
.align 2
|
||||
1:
|
||||
.rept TMDS_ENCODE_UNROLL
|
||||
ldmia r0!, {r4, r6}
|
||||
lsls r4, r3
|
||||
do_channel_16bpp r2, r4, r5
|
||||
lsls r6, r3
|
||||
do_channel_16bpp r2, r6, r7
|
||||
stmia r1!, {r4, r5, r6, r7}
|
||||
.endr
|
||||
2:
|
||||
cmp r1, ip
|
||||
bne 1b
|
||||
pop {r4, r5, r6, r7, pc}
|
||||
tmds_encode_loop_16bpp_impl 1
|
||||
|
||||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Input size (pixels)
|
||||
|
||||
decl_func tmds_encode_loop_8bpp
|
||||
push {r4, r5, r6, r7, lr}
|
||||
lsls r2, #2
|
||||
add r2, r1
|
||||
mov ip, r2
|
||||
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
b 2f
|
||||
.align 2
|
||||
1:
|
||||
.rept TMDS_ENCODE_UNROLL
|
||||
ldmia r0!, {r4}
|
||||
str r4, [r2, #ACCUM0_OFFS + INTERP1]
|
||||
str r4, [r2, #ACCUM0_OFFS]
|
||||
ldr r4, [r2, #PEEK0_OFFS]
|
||||
ldr r4, [r4]
|
||||
ldr r5, [r2, #PEEK1_OFFS]
|
||||
ldr r5, [r5]
|
||||
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
|
||||
ldr r6, [r6]
|
||||
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
|
||||
ldr r7, [r7]
|
||||
stmia r1!, {r4, r5, r6, r7}
|
||||
.endr
|
||||
2:
|
||||
cmp r1, ip
|
||||
bne 1b
|
||||
pop {r4, r5, r6, r7, pc}
|
||||
|
||||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Input size (pixels)
|
||||
// r3: Left shift amount
|
||||
// r3: Left shift amount (for the *_leftshift variant of the function)
|
||||
//
|
||||
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
|
||||
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
|
||||
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
|
||||
// since its channel MSBs are no greater than 7.
|
||||
|
||||
decl_func tmds_encode_loop_8bpp_leftshift
|
||||
#if defined(__arm__)
|
||||
.macro tmds_encode_loop_8bpp_impl leftshift
|
||||
push {r4, r5, r6, r7, lr}
|
||||
lsls r2, #3
|
||||
lsls r2, #2
|
||||
add r2, r1
|
||||
mov ip, r2
|
||||
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
|
@ -154,7 +170,9 @@ decl_func tmds_encode_loop_8bpp_leftshift
|
|||
.rept TMDS_ENCODE_UNROLL
|
||||
ldmia r0!, {r4}
|
||||
str r4, [r2, #ACCUM0_OFFS + INTERP1]
|
||||
.if \leftshift
|
||||
lsls r4, r3
|
||||
.endif
|
||||
str r4, [r2, #ACCUM0_OFFS]
|
||||
ldr r4, [r2, #PEEK0_OFFS]
|
||||
ldr r4, [r4]
|
||||
|
@ -170,6 +188,54 @@ decl_func tmds_encode_loop_8bpp_leftshift
|
|||
cmp r1, ip
|
||||
bne 1b
|
||||
pop {r4, r5, r6, r7, pc}
|
||||
.endm
|
||||
|
||||
#elif defined(__riscv)
|
||||
// RISC-V loop body for the 8bpp encode functions. \leftshift selects at
// assembly time whether the copy of each input word written to the
// ACCUM0_OFFS slot is shifted left by a3; the copy written to
// ACCUM0_OFFS + INTERP1 is always stored unshifted.
//   a0: input pointer   (advanced by 4 bytes per unrolled step)
//   a1: output pointer  (advanced by 16 bytes per unrolled step)
//   a2: input size in pixels on entry; overwritten with the interpolator base
//   a3: left shift amount (only read when \leftshift is nonzero)
// Overwrites a4-a7 and t0. Ends with ret, so it forms the whole function body.
.macro tmds_encode_loop_8bpp_impl leftshift
|
||||
// a2 = a1 + 4 * a2: end-of-output pointer (4 output bytes per input pixel)
slli a2, a2, 2
|
||||
add a2, a2, a1
|
||||
// Empty output range: skip the loop and return immediately
bgeu a1, a2, 2f
|
||||
// Keep the limit in t0 so a2 can be reused as the interpolator base address
mv t0, a2
|
||||
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
|
||||
.align 2
|
||||
1:
|
||||
// i is an assembly-time counter giving each unrolled copy its own load/store
// offsets, so the pointers only need updating once per pass
.set i, 0
|
||||
.rept TMDS_ENCODE_UNROLL
|
||||
lw a4, 4 * i(a0)
|
||||
// Unshifted copy goes to the INTERP1 accumulator slot first...
sw a4, ACCUM0_OFFS + INTERP1(a2)
|
||||
.if \leftshift
|
||||
sll a4, a4, a3
|
||||
.endif
|
||||
// ...then the (optionally shifted) copy goes to the ACCUM0_OFFS slot
sw a4, ACCUM0_OFFS(a2)
|
||||
// Each PEEK value is a pointer: read a pair of pointers, then dereference
// them, so a load never feeds the address of the very next load
lw a4, PEEK0_OFFS(a2)
|
||||
lw a5, PEEK1_OFFS(a2)
|
||||
lw a4, (a4)
|
||||
lw a5, (a5)
|
||||
lw a6, PEEK0_OFFS + INTERP1(a2)
|
||||
lw a7, PEEK1_OFFS + INTERP1(a2)
|
||||
lw a6, (a6)
|
||||
lw a7, (a7)
|
||||
sw a4, 16 * i + 0(a1)
|
||||
sw a5, 16 * i + 4(a1)
|
||||
sw a6, 16 * i + 8(a1)
|
||||
sw a7, 16 * i + 12(a1)
|
||||
.set i, i + 1
|
||||
.endr
|
||||
// Step both pointers past everything the unrolled body just handled
addi a0, a0, TMDS_ENCODE_UNROLL * 4
|
||||
addi a1, a1, TMDS_ENCODE_UNROLL * 16
|
||||
// The limit is only tested once per unrolled pass, so a full pass always runs
// even if fewer bytes than that remain before t0
bltu a1, t0, 1b
|
||||
2:
|
||||
ret
|
||||
.endm
|
||||
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
decl_func tmds_encode_loop_8bpp
|
||||
tmds_encode_loop_8bpp_impl 0
|
||||
decl_func tmds_encode_loop_8bpp_leftshift
|
||||
tmds_encode_loop_8bpp_impl 1
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Fast 1bpp black/white encoder (full res)
|
||||
|
@ -190,6 +256,8 @@ decl_func tmds_encode_loop_8bpp_leftshift
|
|||
// r3 contains lookup mask (preshifted)
|
||||
// r8 contains pointer to encode table
|
||||
// 2.125 cyc/pix
|
||||
|
||||
#if defined(__arm__)
|
||||
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
|
||||
\shift_instr0 r4, r2, #\shamt0
|
||||
ands r4, r3
|
||||
|
@ -238,6 +306,58 @@ decl_func tmds_encode_1bpp
|
|||
mov r8, r7
|
||||
pop {r4-r7, pc}
|
||||
|
||||
#elif defined(__riscv)
|
||||
// TODO the register allocation is not optimal here for code size
|
||||
// RISC-V variant of tmds_encode_1bpp_body: performs two table lookups from the
// input word in a2 and writes the four resulting words to the output.
//   shift_instr0/shamt0: shift applied to a2 to position the first index
//   shift_instr1/shamt1: shift applied to a2 to position the second index
// Expects: a1 = output pointer, a2 = input bits, a3 = index mask,
//          t1 = table base address.
// Overwrites a4-a7 and advances a1 by 16 bytes. a2, a3 and t1 are preserved.
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
|
||||
// First lookup: a4 = address of the table entry, then load its two words
\shift_instr0 a4, a2, \shamt0
|
||||
and a4, a4, a3
|
||||
add a4, a4, t1
|
||||
// Load the high word first so a4 can be overwritten by the final load
lw a5, 4(a4)
|
||||
lw a4, 0(a4)
|
||||
// Second lookup, same pattern using a6/a7
\shift_instr1 a6, a2, \shamt1
|
||||
and a6, a6, a3
|
||||
add a6, a6, t1
|
||||
lw a7, 4(a6)
|
||||
lw a6, 0(a6)
|
||||
sw a4, 0(a1)
|
||||
sw a5, 4(a1)
|
||||
sw a6, 8(a1)
|
||||
sw a7, 12(a1)
|
||||
addi a1, a1, 16
|
||||
.endm
|
||||
|
||||
// RISC-V implementation of tmds_encode_1bpp.
// Each loop iteration reads one 32-bit input word and runs four
// tmds_encode_1bpp_body expansions, i.e. 64 bytes of output per input word.
// Overwrites a2-a7, t0 and t1; a0 and a1 are advanced.
// a0: input buffer (word-aligned)
|
||||
// a1: output buffer (word-aligned)
|
||||
// a2: output pixel count
|
||||
decl_func tmds_encode_1bpp
|
||||
// t0 = a1 + 2 * a2: end-of-output pointer
slli a2, a2, 1
|
||||
add t0, a2, a1
|
||||
la t1, tmds_1bpp_table
|
||||
// Mask: 4 bit index, 8 bytes per entry
|
||||
li a3, 0x78
|
||||
// Empty output range: skip the loop and return immediately
bgeu a1, t0, 2f
|
||||
1:
|
||||
// a2 is reused to hold the current input word (the count now lives in t0)
lw a2, (a0)
|
||||
addi a0, a0, 4
|
||||
// The shifts move successive 4-bit groups of a2 into bits 6:3, where the
// 0x78 mask selects them. The two variants differ only in the order in which
// the two groups of each byte are looked up.
#if !DVI_1BPP_BIT_REVERSE
|
||||
tmds_encode_1bpp_body slli 3 srli 1
|
||||
tmds_encode_1bpp_body srli 5 srli 9
|
||||
tmds_encode_1bpp_body srli 13 srli 17
|
||||
tmds_encode_1bpp_body srli 21 srli 25
|
||||
#else
|
||||
tmds_encode_1bpp_body srli 1 slli 3
|
||||
tmds_encode_1bpp_body srli 9 srli 5
|
||||
tmds_encode_1bpp_body srli 17 srli 13
|
||||
tmds_encode_1bpp_body srli 25 srli 21
|
||||
#endif
|
||||
// a1 was advanced by the body macros; loop until it reaches the limit
bltu a1, t0, 1b
|
||||
2:
|
||||
ret
|
||||
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
.align 2
|
||||
tmds_1bpp_table:
|
||||
#if !DVI_1BPP_BIT_REVERSE
|
||||
|
@ -299,6 +419,7 @@ tmds_1bpp_table:
|
|||
// level 2: (a5 -> 163) always
|
||||
// level 3: (ef -> 2f0) always
|
||||
|
||||
#if defined(__arm__)
|
||||
// Table base pointer in r0. Input pixels in r2.
|
||||
.macro encode_2bpp_body shift_instr shamt rd
|
||||
\shift_instr \rd, r2, #\shamt
|
||||
|
@ -343,6 +464,55 @@ decl_func tmds_encode_2bpp
|
|||
mov r8, r7
|
||||
pop {r4-r7, pc}
|
||||
|
||||
#elif defined(__riscv)
|
||||
// RISC-V variant of encode_2bpp_body: one table lookup.
// Table base pointer in a0. Input pixels in a2.
|
||||
// a3 holds the index mask. \shift_instr/\shamt position the wanted bits of a2
// under that mask; the masked value is used as a byte offset from a0 and the
// word found there is returned in \rd. Only \rd is modified.
.macro encode_2bpp_body shift_instr shamt rd
|
||||
\shift_instr \rd, a2, \shamt
|
||||
and \rd, \rd, a3
|
||||
add \rd, \rd, a0
|
||||
lw \rd, (\rd)
|
||||
.endm
|
||||
|
||||
// RISC-V implementation of tmds_encode_2bpp.
//
// a0: input buffer (word-aligned)
// a1: output buffer (word-aligned)
// a2: output pixel count
//
// Each loop iteration reads one 32-bit input word and performs eight table
// lookups, each selected by a 4-bit field of that word, writing eight output
// words (32 bytes). The limit is only tested once per iteration.
//
// Overwrites a0, a2-a7, t0 and t1; a1 is advanced past the written output.
decl_func tmds_encode_2bpp
	// encode_2bpp_body expects the table base in a0, so move the input
	// pointer out of the way first.
	mv t1, a0
	la a0, tmds_2bpp_table
	// Mask: 4-bit index into 4-byte entries.
	li a3, 0x3c
	// Limit pointer: 1 word per 2 pixels
	slli a2, a2, 1
	add t0, a2, a1
	// Empty output range: go straight to the return. This must be a forward
	// reference to this function's own exit label (2f); a backward reference
	// (1b) would bind to the nearest earlier "1:" label, which belongs to a
	// different function.
	bgeu a1, t0, 2f
1:
	// a2 is reused to hold the current input word (the limit lives in t0)
	lw a2, (t1)
	addi t1, t1, 4
	// The shifts move successive 4-bit fields of a2 into bits 5:2, where the
	// 0x3c mask turns them into byte offsets into the table.
	encode_2bpp_body slli 2 a4
	encode_2bpp_body srli 2 a5
	encode_2bpp_body srli 6 a6
	encode_2bpp_body srli 10 a7
	sw a4, 0(a1)
	sw a5, 4(a1)
	sw a6, 8(a1)
	sw a7, 12(a1)
	encode_2bpp_body srli 14 a4
	encode_2bpp_body srli 18 a5
	encode_2bpp_body srli 22 a6
	encode_2bpp_body srli 26 a7
	sw a4, 16(a1)
	sw a5, 20(a1)
	sw a6, 24(a1)
	sw a7, 28(a1)
	addi a1, a1, 32
	bltu a1, t0, 1b
2:
	ret
|
||||
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
.align 2
|
||||
tmds_2bpp_table:
|
||||
.word 0x7f103 // 00, 00
|
||||
|
@ -404,17 +574,20 @@ tmds_2bpp_table:
|
|||
// much better, and many monitors will still accept the signals as long as you
|
||||
// DC couple your DVI signals.
|
||||
|
||||
.macro tmds_fullres_encode_loop_body ra rb
|
||||
#if defined(__arm__)
|
||||
.macro tmds_fullres_encode_loop_body leftshift ra rb
|
||||
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
|
||||
.if \leftshift
|
||||
lsls \ra, r3
|
||||
.endif
|
||||
str \ra, [r2, #ACCUM0_OFFS]
|
||||
// Loads interleaved to avoid rdata->addr stall on M33
|
||||
ldr \ra, [r2, #PEEK2_OFFS]
|
||||
ldr \ra, [\ra]
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \ra, [r2, #ACCUM1_ADD_OFFS]
|
||||
#endif
|
||||
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
|
||||
ldr \ra, [\ra]
|
||||
ldr \rb, [\rb]
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \ra, [r2, #ACCUM1_ADD_OFFS]
|
||||
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
|
||||
#endif
|
||||
.endm
|
||||
|
@ -422,8 +595,9 @@ tmds_2bpp_table:
|
|||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Pixel count
|
||||
// r3: Left shift amount
|
||||
|
||||
.macro tmds_fullres_encode_loop_16bpp
|
||||
.macro tmds_fullres_encode_loop_16bpp leftshift
|
||||
push {r4-r7, lr}
|
||||
mov r4, r8
|
||||
push {r4}
|
||||
|
@ -451,8 +625,8 @@ tmds_2bpp_table:
|
|||
1:
|
||||
.rept 16
|
||||
ldmia r0!, {r4, r6}
|
||||
tmds_fullres_encode_loop_body r4 r5
|
||||
tmds_fullres_encode_loop_body r6 r7
|
||||
tmds_fullres_encode_loop_body \leftshift r4 r5
|
||||
tmds_fullres_encode_loop_body \leftshift r6 r7
|
||||
stmia r1!, {r4, r5, r6, r7}
|
||||
.endr
|
||||
2:
|
||||
|
@ -465,82 +639,77 @@ tmds_2bpp_table:
|
|||
pop {r4-r7, pc}
|
||||
.endm
|
||||
|
||||
// One copy each in X and Y, so the two cores don't step on each other
|
||||
decl_func_x tmds_fullres_encode_loop_16bpp_x
|
||||
tmds_fullres_encode_loop_16bpp
|
||||
decl_func_y tmds_fullres_encode_loop_16bpp_y
|
||||
tmds_fullres_encode_loop_16bpp
|
||||
#elif defined(__riscv)
|
||||
|
||||
|
||||
.macro tmds_fullres_encode_loop_body_leftshift ra rb
|
||||
// Note we apply the leftshift for INTERP0 only
|
||||
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
|
||||
lsls \ra, r3
|
||||
str \ra, [r2, #ACCUM0_OFFS]
|
||||
ldr \ra, [r2, #PEEK2_OFFS]
|
||||
ldr \ra, [\ra]
|
||||
.macro tmds_fullres_encode_loop_body leftshift ra rb
|
||||
sw \ra, ACCUM0_OFFS + INTERP1(a2)
|
||||
.if \leftshift
|
||||
sll \ra, \ra, a3
|
||||
.endif
|
||||
sw \ra, ACCUM0_OFFS(a2)
|
||||
lw \ra, PEEK2_OFFS(a2)
|
||||
lw \rb, PEEK2_OFFS + INTERP1(a2)
|
||||
lw \ra, (\ra)
|
||||
lw \rb, (\rb)
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \ra, [r2, #ACCUM1_ADD_OFFS]
|
||||
#endif
|
||||
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
|
||||
ldr \rb, [\rb]
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
|
||||
sw \ra, ACCUM1_ADD_OFFS(a2)
|
||||
sw \rb, ACCUM1_ADD_OFFS + INTERP1(a2)
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// r0: Input buffer (word-aligned)
|
||||
// r1: Output buffer (word-aligned)
|
||||
// r2: Pixel count
|
||||
// r3: Left shift amount
|
||||
// a0: Input buffer (word-aligned)
|
||||
// a1: Output buffer (word-aligned)
|
||||
// a2: Pixel count
|
||||
// a3: Left shift amount
|
||||
|
||||
.macro tmds_fullres_encode_loop_16bpp_leftshift
|
||||
push {r4-r7, lr}
|
||||
mov r4, r8
|
||||
mov r5, r9
|
||||
push {r4-r5}
|
||||
|
||||
lsls r2, #2
|
||||
add r2, r1
|
||||
mov ip, r2
|
||||
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
.macro tmds_fullres_encode_loop_16bpp leftshift
|
||||
sh2add t0, a2, a1
|
||||
li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
|
||||
// DC balance defined to be 0 at start of scanline:
|
||||
movs r4, #0
|
||||
str r4, [r2, #ACCUM1_OFFS]
|
||||
li a4, 0
|
||||
sw a4, ACCUM1_OFFS(a2)
|
||||
#if TMDS_FULLRES_NO_DC_BALANCE
|
||||
// Alternate parity between odd/even symbols if there's no balance feedback
|
||||
mvns r4, r4
|
||||
// Alternate parity between odd/even symbols if no feedback
|
||||
li a4, -1
|
||||
#endif
|
||||
str r4, [r2, #ACCUM1_OFFS + INTERP1]
|
||||
sw a4, ACCUM1_OFFS + INTERP1(a2)
|
||||
|
||||
adr r4, 1f
|
||||
adds r4, #1
|
||||
mov r8, r4
|
||||
b 2f
|
||||
bgeu a1, t0, 2f
|
||||
.align 2
|
||||
1:
|
||||
.rept 16 // 64 pixels per iteration
|
||||
ldmia r0!, {r4, r6}
|
||||
tmds_fullres_encode_loop_body_leftshift r4 r5
|
||||
tmds_fullres_encode_loop_body_leftshift r6 r7
|
||||
stmia r1!, {r4, r5, r6, r7}
|
||||
.set i, 0
|
||||
.rept 16
|
||||
lw a4, 8 * i + 0(a0)
|
||||
lw a6, 8 * i + 4(a0)
|
||||
tmds_fullres_encode_loop_body \leftshift a4 a5
|
||||
tmds_fullres_encode_loop_body \leftshift a6 a7
|
||||
sw a4, 16 * i + 0(a1)
|
||||
sw a5, 16 * i + 4(a1)
|
||||
sw a6, 16 * i + 8(a1)
|
||||
sw a7, 16 * i + 12(a1)
|
||||
.set i, i + 1
|
||||
.endr
|
||||
addi a0, a0, 8 * i
|
||||
addi a1, a1, 16 * i
|
||||
bltu a1, t0, 1b
|
||||
2:
|
||||
cmp r1, ip
|
||||
beq 1f
|
||||
bx r8
|
||||
1:
|
||||
pop {r4-r5}
|
||||
mov r8, r4
|
||||
mov r9, r5
|
||||
pop {r4-r7, pc}
|
||||
ret
|
||||
.endm
|
||||
|
||||
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
|
||||
tmds_fullres_encode_loop_16bpp_leftshift
|
||||
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
|
||||
tmds_fullres_encode_loop_16bpp_leftshift
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
// One copy each in X and Y, so the two cores don't step on each other
|
||||
decl_func_x tmds_fullres_encode_loop_16bpp_x
|
||||
tmds_fullres_encode_loop_16bpp 0
|
||||
decl_func_y tmds_fullres_encode_loop_16bpp_y
|
||||
tmds_fullres_encode_loop_16bpp 0
|
||||
|
||||
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
|
||||
tmds_fullres_encode_loop_16bpp 1
|
||||
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
|
||||
tmds_fullres_encode_loop_16bpp 1
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Full-resolution 8bpp paletted encode
|
||||
|
@ -550,19 +719,19 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
|
|||
// base is set to a reordered list of TMDS symbols based
|
||||
// on a user colour palette.
|
||||
|
||||
#ifdef __arm__
|
||||
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
|
||||
// interp base pointer. r7 used as temporary.
|
||||
.macro tmds_palette_encode_loop_body rd
|
||||
str \rd, [r2, #ACCUM0_OFFS]
|
||||
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
|
||||
// Loads interleaved to avoid rdata->addr stall on M33
|
||||
ldr \rd, [r2, #PEEK2_OFFS]
|
||||
ldr \rd, [\rd]
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \rd, [r2, #ACCUM1_ADD_OFFS]
|
||||
#endif
|
||||
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
|
||||
ldr \rd, [\rd]
|
||||
ldr r7, [r7]
|
||||
#if !TMDS_FULLRES_NO_DC_BALANCE
|
||||
str \rd, [r2, #ACCUM1_ADD_OFFS]
|
||||
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
|
||||
#endif
|
||||
lsls r7, #10
|
||||
|
@ -617,7 +786,241 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
|
|||
pop {r4-r7, pc}
|
||||
.endm
|
||||
|
||||
#elif defined(__riscv)
|
||||
|
||||
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. a2 contains
|
||||
// interp base pointer. a5 used as temporary.
|
||||
.macro tmds_palette_encode_loop_body rd
	// Feed the same input word to the ACCUM0 register of both interpolators
	// (a2 is the base pointer; the second one lives at +INTERP1).
	sw \rd, ACCUM0_OFFS(a2)
	sw \rd, ACCUM0_OFFS + INTERP1(a2)
	// Each PEEK2 read returns a pointer; fetch both pointers first, then
	// dereference both, so the two dependent load pairs are interleaved.
	lw \rd, PEEK2_OFFS(a2)
	lw a5, PEEK2_OFFS + INTERP1(a2)
	lw \rd, (\rd)
	lw a5, (a5)
#if !TMDS_FULLRES_NO_DC_BALANCE
	// Write each looked-up word back to its interpolator's ACCUM1_ADD register
	sw \rd, ACCUM1_ADD_OFFS(a2)
	sw a5, ACCUM1_ADD_OFFS + INTERP1(a2)
#endif
	// Combine: the second lookup is placed 10 bits above the first
	slli a5, a5, 10
	or \rd, \rd, a5
.endm
|
||||
|
||||
.macro tmds_palette_encode_loop
	// a0: input buffer, a1: output buffer, a2: pixel count.
	// s0 is used as a working register inside the loop, so its value is
	// parked in t1 for the duration and put back before returning.
	mv t1, s0
	// t0 = output end pointer = a1 + 2 * a2
	sh1add t0, a2, a1
	li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
	// DC balance defined to be 0 at start of scanline:
	li a4, 0
	sw a4, ACCUM1_OFFS(a2)
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	li a4, -1
#endif
	sw a4, ACCUM1_OFFS + INTERP1(a2)

	// Nothing to do if the output range is empty
	bgeu a1, t0, 2f
.align 2
1:
	// Unrolled x10. Each repetition consumes two input words (8 bytes) and
	// produces four output words (16 bytes).
.set i, 0
.rept 10
	lw a3, 8 * i + 0(a0)
	lw s0, 8 * i + 4(a0)
	// Split the first word into two encode inputs: a4 = word >> 14,
	// a3 = word << 2, each handed to the loop body macro.
	srli a4, a3, 14
	slli a3, a3, 2
	tmds_palette_encode_loop_body a3
	tmds_palette_encode_loop_body a4
	sw a3, 16 * i + 0(a1)
	sw a4, 16 * i + 4(a1)
	// Same again for the second word
	srli a4, s0, 14
	slli s0, s0, 2
	tmds_palette_encode_loop_body s0
	tmds_palette_encode_loop_body a4
	sw s0, 16 * i + 8(a1)
	sw a4, 16 * i + 12(a1)
.set i, i + 1
.endr
	// i now equals the repeat count: bump both pointers past what was processed
	addi a0, a0, 8 * i
	addi a1, a1, 16 * i
	bltu a1, t0, 1b
2:
	mv s0, t1
	ret
.endm
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
decl_func_x tmds_palette_encode_loop_x
|
||||
tmds_palette_encode_loop
|
||||
decl_func_y tmds_palette_encode_loop_y
|
||||
tmds_palette_encode_loop
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Hand-cranking loops for SIO TMDS encoders
|
||||
|
||||
#if DVI_USE_SIO_TMDS_ENCODER
|
||||
|
||||
#if defined(__arm__)
|
||||
|
||||
// r0: input buffer (word-aligned)
|
||||
// r1: output buffer (word-aligned)
|
||||
// r2: pixel count
|
||||
|
||||
.macro tmds_encode_sio_loop size_ratio peek

// For larger load/store offsets at high ratios/unroll:
.cpu cortex-m33

// Number of input words handled per loop iteration. Each input word yields
// \size_ratio output words, so the unroll shrinks as the ratio grows and
// bottoms out at 1.
.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
.set unroll, 1
.else
.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
.endif

// With \peek set, every even-numbered output word is read through an address
// offset by (PEEK_SINGLE - POP_SINGLE); without it the offset is always 0.
.if \peek
.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
.else
.set even_offset_adj, 0
.endif

	push {r4, lr}
	// r2 = output end pointer = r1 + count * (4 or 2)
#if DVI_SYMBOLS_PER_WORD == 1
	lsls r2, r2, #2
#else
	lsls r2, r2, #1
#endif
	adds r2, r1
	ldr r3, =SIO_BASE + SIO_TMDS_CTRL_OFFSET
	// Enter at the loop test so an empty range does no work
	b 2f
1:
.set i, 0
.rept unroll
	// Write one input word to WDATA...
	ldr r4, [r0, #i * 4]
	str r4, [r3, #SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET]
.set j, 0
.rept \size_ratio
	// ...then read back \size_ratio output words from it
.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
#if DVI_SYMBOLS_PER_WORD == 2
	ldr r4, [r3, #offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET]
#else
	ldr r4, [r3, #offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET]
#endif
	str r4, [r1, #4 * (j + i * \size_ratio)]
.set j, j + 1
.endr
.set i, i + 1
.endr
	adds r0, 4 * unroll
	adds r1, 4 * unroll * \size_ratio
2:
	cmp r1, r2
	blo 1b
	pop {r4, pc}

.cpu cortex-m0plus
.endm
|
||||
|
||||
#elif defined(__riscv)
|
||||
|
||||
// a0: input buffer (word-aligned)
|
||||
// a1: output buffer (word-aligned)
|
||||
// a2: pixel count
|
||||
|
||||
.macro tmds_encode_sio_loop size_ratio peek

// Number of input words handled per loop iteration. Each input word yields
// \size_ratio output words, so the unroll shrinks as the ratio grows and
// bottoms out at 1.
.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
.set unroll, 1
.else
.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
.endif

// With \peek set, every even-numbered output word is read through an address
// offset by (PEEK_SINGLE - POP_SINGLE); without it the offset is always 0.
.if \peek
.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
.else
.set even_offset_adj, 0
.endif

	// a2 = output end pointer = a1 + count * (4 or 2)
#if DVI_SYMBOLS_PER_WORD == 1
	sh2add a2, a2, a1
#else
	sh1add a2, a2, a1
#endif
	li a3, SIO_BASE + SIO_TMDS_CTRL_OFFSET
	// Skip the loop entirely for an empty range
	bgeu a1, a2, 2f
1:
.set i, 0
.rept unroll
	// Write one input word to WDATA...
	lw a4, i * 4(a0)
	sw a4, SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
.set j, 0
.rept \size_ratio
	// ...then read back \size_ratio output words from it
.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
#if DVI_SYMBOLS_PER_WORD == 2
	lw a4, offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
#else
	lw a4, offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
#endif
	sw a4, 4 * (j + i * \size_ratio)(a1)
.set j, j + 1
.endr
.set i, i + 1
.endr
	addi a0, a0, 4 * unroll
	addi a1, a1, 4 * unroll * \size_ratio
	bltu a1, a2, 1b
2:
	ret
.endm
|
||||
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
// For DVI_SYMBOLS_PER_WORD == 2, the ratio of output : input buffer size is:
|
||||
//
|
||||
// Bits/pixel | Ratio (with hdouble) | Ratio (no hdouble)
|
||||
// -----------+----------------------+-------------------
|
||||
// 1 | 32 | 16
|
||||
// 2 | 16 | 8
|
||||
// 4 | 8 | 4
|
||||
// 8 | 4 | 2
|
||||
// 16 | 2 | 1
|
||||
//
|
||||
// For DVI_SYMBOLS_PER_WORD == 1, these ratios are doubled.
|
||||
|
||||
// poppop variants will read from a xxx_POP register for every output word
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio1
|
||||
tmds_encode_sio_loop 1, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio2
|
||||
tmds_encode_sio_loop 2, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio4
|
||||
tmds_encode_sio_loop 4, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio8
|
||||
tmds_encode_sio_loop 8, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio16
|
||||
tmds_encode_sio_loop 16, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio32
|
||||
tmds_encode_sio_loop 32, 0
|
||||
decl_func tmds_encode_sio_loop_poppop_ratio64
|
||||
tmds_encode_sio_loop 64, 0
|
||||
|
||||
// peekpop variants will read alternately from xxx_PEEK and xxx_POP: this is
|
||||
// needed for pixel-doubled output when DVI_SYMBOLS_PER_WORD == 1 (note the
|
||||
// POP value is different from the PEEK value, as it's the same data but with
|
||||
// different running DC balance)
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio1
|
||||
tmds_encode_sio_loop 1, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio2
|
||||
tmds_encode_sio_loop 2, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio4
|
||||
tmds_encode_sio_loop 4, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio8
|
||||
tmds_encode_sio_loop 8, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio16
|
||||
tmds_encode_sio_loop 16, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio32
|
||||
tmds_encode_sio_loop 32, 1
|
||||
decl_func tmds_encode_sio_loop_peekpop_ratio64
|
||||
tmds_encode_sio_loop 64, 1
|
||||
|
||||
#endif
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include "hardware/gpio.h"
|
||||
#include "hardware/sync.h"
|
||||
|
||||
static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
|
||||
static const __unused uint32_t __scratch_x("tmds_table") tmds_table[] = {
|
||||
#include "tmds_table.h"
|
||||
};
|
||||
|
||||
|
@ -11,14 +11,15 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
|
|||
// memory. There is a third copy which can go in flash, because it's just used
|
||||
// to generate palette LUTs. The ones we don't use will get garbage collected
|
||||
// during linking.
|
||||
const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
|
||||
const __unused uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
|
||||
#include "tmds_table_fullres.h"
|
||||
};
|
||||
|
||||
const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
|
||||
const __unused uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
|
||||
#include "tmds_table_fullres.h"
|
||||
};
|
||||
|
||||
#if !DVI_USE_SIO_TMDS_ENCODER
|
||||
// Configure an interpolator to extract a single colour channel from each of a pair
|
||||
// of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
|
||||
// pixel_width wide. Produce a LUT address for the first pixel's colour data on
|
||||
|
@ -35,11 +36,16 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp
|
|||
|
||||
int shift_channel_to_index = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
|
||||
int oops = 0;
|
||||
#if PICO_RP2040
|
||||
if (shift_channel_to_index < 0) {
|
||||
// "It's ok we'll fix it in software"
|
||||
oops = -shift_channel_to_index;
|
||||
shift_channel_to_index = 0;
|
||||
}
|
||||
#else
|
||||
// Now a right-rotate, not a right-shift
|
||||
shift_channel_to_index &= 0x1f;
|
||||
#endif
|
||||
|
||||
uint index_msb = index_shift + lut_index_width - 1;
|
||||
|
||||
|
@ -60,23 +66,60 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp
|
|||
return oops;
|
||||
}
|
||||
|
||||
#else
|
||||
// Encoding a single channel at a time is not the most efficient way to use
|
||||
// this hardware, because it means we read the colour buffer multiple times,
|
||||
// but it fits better with how things are done in software on RP2040.
|
||||
static void __not_in_flash_func(configure_sio_tmds_for_single_channel)(uint channel_msb, uint channel_lsb, uint pixel_width, bool hdouble) {
	assert(channel_msb - channel_lsb <= 7); // 1 through 8 bits, inclusive

	// Field values for the single write to SIO TMDS_CTRL below. Only lane 0
	// (L0_*) is configured here.
	const uint l0_nbits = channel_msb - channel_lsb;
	const uint l0_rot = (channel_msb - 7u) & 0xfu;
	// pixel_width is expected to be a power of two; the field value is 1 + log2(width)
	const uint pix_shift = 1 + __builtin_ctz(pixel_width);
	const uint pix2_noshift = (uint)hdouble;

	sio_hw->tmds_ctrl =
		SIO_TMDS_CTRL_CLEAR_BALANCE_BITS |
		(l0_nbits << SIO_TMDS_CTRL_L0_NBITS_LSB) |
		(l0_rot << SIO_TMDS_CTRL_L0_ROT_LSB) |
		(pix_shift << SIO_TMDS_CTRL_PIX_SHIFT_LSB) |
		(pix2_noshift << SIO_TMDS_CTRL_PIX2_NOSHIFT_LSB);
}
|
||||
#endif
|
||||
|
||||
// Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer
|
||||
// of TMDS symbols from this colour channel. Number of pixels must be even,
|
||||
// pixel buffer must be word-aligned.
|
||||
|
||||
void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
#if DVI_USE_SIO_TMDS_ENCODER
	// SIO encoder path: configure for 16-bit pixels with horizontal doubling,
	// then crank the encoder. The loops are passed 2 * n_pix as their count.
	configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, true);
#if DVI_SYMBOLS_PER_WORD == 1
	tmds_encode_sio_loop_peekpop_ratio4(pixbuf, symbuf, 2 * n_pix);
#else
	tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, 2 * n_pix);
#endif
#else
	// Interpolator path: INTERP0 is borrowed for LUT address generation and
	// its previous state is restored afterwards.
	interp_hw_save_t interp0_save;
	interp_save(interp0_hw, &interp0_save);
	int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
#if PICO_RP2040
	// A nonzero return is a left-shift amount that the encode loop must apply
	if (require_lshift) {
		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
	} else {
		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
	}
#else
	// No software left shift is expected on other platforms
	assert(!require_lshift); (void)require_lshift;
	tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
#endif
	interp_restore(interp0_hw, &interp0_save);
#endif
}
|
||||
|
||||
// As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned.
|
||||
void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
|
||||
#if DVI_USE_SIO_TMDS_ENCODER
|
||||
configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 8, true);
|
||||
#if DVI_SYMBOLS_PER_WORD == 1
|
||||
tmds_encode_sio_loop_peekpop_ratio8(pixbuf, symbuf, 2 * n_pix);
|
||||
#else
|
||||
tmds_encode_sio_loop_poppop_ratio4(pixbuf, symbuf, 2 * n_pix);
|
||||
#endif
|
||||
#else
|
||||
interp_hw_save_t interp0_save, interp1_save;
|
||||
interp_save(interp0_hw, &interp0_save);
|
||||
interp_save(interp1_hw, &interp1_save);
|
||||
|
@ -86,12 +129,18 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
|
|||
int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
|
||||
int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
|
||||
assert(!lshift_upper); (void)lshift_upper;
|
||||
#if PICO_RP2040
|
||||
if (require_lshift)
|
||||
tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
|
||||
else
|
||||
tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
|
||||
#else
|
||||
assert(!require_lshift); (void)require_lshift;
|
||||
tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
|
||||
#endif
|
||||
interp_restore(interp0_hw, &interp0_save);
|
||||
interp_restore(interp1_hw, &interp1_save);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -103,16 +152,22 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
|
|||
// pixels, and INTERP1 for odd pixels. Note this means that even and odd
|
||||
// symbols have their DC balance handled separately, which is not to spec.
|
||||
|
||||
#if !DVI_USE_SIO_TMDS_ENCODER
|
||||
static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
|
||||
const uint index_shift = 2; // scaled lookup for 4-byte LUT entries
|
||||
|
||||
int shift_channel_to_index = channel_msb - (lut_index_width - 1) - index_shift;
|
||||
int oops = 0;
|
||||
#if PICO_RP2040
|
||||
if (shift_channel_to_index < 0) {
|
||||
// "It's ok we'll fix it in software"
|
||||
oops = -shift_channel_to_index;
|
||||
shift_channel_to_index = 0;
|
||||
}
|
||||
#else
|
||||
// Now a right-rotate rather than right-shift
|
||||
shift_channel_to_index &= 0x1f;
|
||||
#endif
|
||||
|
||||
uint index_msb = index_shift + lut_index_width - 1;
|
||||
|
||||
|
@ -133,8 +188,17 @@ static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t
|
|||
|
||||
return oops;
|
||||
}
|
||||
#endif
|
||||
|
||||
void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
|
||||
#if DVI_USE_SIO_TMDS_ENCODER
|
||||
configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, false);
|
||||
#if DVI_SYMBOLS_PER_WORD == 1
|
||||
tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, n_pix);
|
||||
#else
|
||||
tmds_encode_sio_loop_poppop_ratio1(pixbuf, symbuf, n_pix);
|
||||
#endif
|
||||
#else
|
||||
uint core = get_core_num();
|
||||
#if !TMDS_FULLRES_NO_INTERP_SAVE
|
||||
interp_hw_save_t interp0_save, interp1_save;
|
||||
|
@ -165,17 +229,16 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
|
|||
interp_restore(interp0_hw, &interp0_save);
|
||||
interp_restore(interp1_hw, &interp1_save);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };
|
||||
|
||||
static inline int byte_imbalance(uint32_t x)
|
||||
{
|
||||
static inline int byte_imbalance(uint32_t x) {
|
||||
return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF];
|
||||
}
|
||||
|
||||
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
|
||||
{
|
||||
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym) {
|
||||
int pixel_imbalance = byte_imbalance(pixel);
|
||||
uint32_t sym = pixel & 1;
|
||||
if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
|
||||
|
|
|
@ -34,4 +34,23 @@ void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t
|
|||
void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
|
||||
#if !PICO_RP2040
|
||||
// Crank the SIO TMDS encoder:
|
||||
void tmds_encode_sio_loop_poppop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_poppop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
|
||||
void tmds_encode_sio_loop_peekpop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
void tmds_encode_sio_loop_peekpop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -3,13 +3,24 @@ add_library(libsprite INTERFACE)
|
|||
target_sources(libsprite INTERFACE
|
||||
${CMAKE_CURRENT_LIST_DIR}/affine_transform.h
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite_asm_const.h
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite.S
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite.c
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite.h
|
||||
${CMAKE_CURRENT_LIST_DIR}/tile.S
|
||||
${CMAKE_CURRENT_LIST_DIR}/tile.c
|
||||
${CMAKE_CURRENT_LIST_DIR}/tile.h
|
||||
)
|
||||
|
||||
if (PICO_RISCV)
|
||||
target_sources(libsprite INTERFACE
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite_riscv.S
|
||||
${CMAKE_CURRENT_LIST_DIR}/tile_riscv.S
|
||||
)
|
||||
else ()
|
||||
target_sources(libsprite INTERFACE
|
||||
${CMAKE_CURRENT_LIST_DIR}/sprite_armv6m.S
|
||||
${CMAKE_CURRENT_LIST_DIR}/tile_armv6m.S
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
target_include_directories(libsprite INTERFACE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_link_libraries(libsprite INTERFACE pico_base_headers hardware_interp)
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
// Stolen from RISCBoy
|
||||
|
||||
#include <stdint.h>
|
||||
#include "pico/platform.h"
|
||||
#include "pico.h"
|
||||
|
||||
// Store unpacked affine transforms as signed 16.16 fixed point in the following order:
|
||||
// a00, a01, b0, a10, a11, b1
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include "sprite.h"
|
||||
#include "affine_transform.h"
|
||||
|
||||
#include "pico/platform.h" // for __not_in_flash
|
||||
#include "pico.h" // for __not_in_flash
|
||||
#include "hardware/interp.h"
|
||||
|
||||
// Note some of the sprite routines are quite large (unrolled), so trying to
|
||||
|
|
|
@ -5,8 +5,11 @@
|
|||
.macro decl_func name
// Open a per-function executable section, export the symbol and start it on
// a 4-byte boundary.
.section .time_critical.\name, "ax"
.global \name
.p2align 2
#ifndef __riscv
// Arm only: mark the symbol as a Thumb function
.type \name,%function
.thumb_func
#endif
\name:
.endm
|
||||
|
||||
|
@ -16,11 +19,40 @@
|
|||
// same way as non-alpha pixels when encoding (and the co-opted channel LSB
|
||||
// always ends up being set on alpha pixels, which is pretty harmless)
|
||||
|
||||
// Also note this is expressed as a right-shift into the carry flag (on Arm),
|
||||
// so this is equal to the bit index of the alpha bit plus 1. On RISC-V it's
|
||||
// idiomatic to shift up to the sign bit instead, so a left shift of 32 - x
|
||||
// should be used instead of a right shift of x.
|
||||
|
||||
#define ALPHA_SHIFT_16BPP 6
|
||||
|
||||
// Assume RAGB2132 (so alpha is bit 5)
|
||||
|
||||
#define ALPHA_SHIFT_8BPP 6
|
||||
|
||||
#ifdef __riscv
|
||||
// Macros for forcing individual instructions to be 32 bits, to maintain
|
||||
// branch target alignment without adding NOPs
|
||||
// Emit \instr with one operand, with compressed encodings disabled
.macro norvc_1a instr, arg0
.option push
.option norvc
	\instr \arg0
.option pop
.endm

// Emit \instr with two operands, with compressed encodings disabled
.macro norvc_2a instr, arg0, arg1
.option push
.option norvc
	\instr \arg0, \arg1
.option pop
.endm

// Emit \instr with three operands, with compressed encodings disabled
.macro norvc_3a instr, arg0, arg1, arg2
.option push
.option norvc
	\instr \arg0, \arg1, \arg2
.option pop
.endm
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,657 @@
|
|||
// Functions for doing simple 2D graphics operations on a RGB scanline buffer.
|
||||
|
||||
#include "hardware/regs/addressmap.h"
|
||||
#include "hardware/regs/sio.h"
|
||||
|
||||
#include "sprite_asm_const.h"
|
||||
|
||||
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
|
||||
#if defined(__riscv_c) || defined(__riscv_zca)
|
||||
#define RISCV_HAVE_COMPRESSED_ISA 1
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Colour fill
|
||||
|
||||
// a0: dst
|
||||
// a1: value
|
||||
// a2: count
|
||||
|
||||
decl_func sprite_fill8
	// a0: dst, a1: value, a2: count (bytes)
	//
	// Short fills (count <= 18) use a computed branch into a slide of byte
	// stores: entering 4 * count bytes before the end of the slide executes
	// exactly the last `count` stores (offsets count-1 down to 0).
	//
	// The li is forced to its 32-bit encoding so that, with the function
	// entry word-aligned by decl_func, auipc is word-aligned too. The four
	// instructions auipc..jr are then exactly 12 bytes, the .align below
	// inserts no padding, and the slide starts at (address of auipc) + 12 as
	// the jr offset assumes. With a 16-bit li the slide would start at +14
	// and the branch would land in the middle of an instruction.
	norvc_2a li a3, 18
	bltu a3, a2, 2f
#ifndef RISCV_HAVE_COMPRESSED_ISA
#error "This address computation is wrong for non-RVC:"
#endif
	auipc a3, 0         // 32-bit instruction after address of auipc
	slli a2, a2, 2      // 16-bit instruction after address of auipc
	sub a3, a3, a2      // 16-bit instruction after address of auipc
	jr a3, 18 * 4 + 12  // 32-bit instruction after address of auipc
.align 2
	// With Zcb this is a mix of 16-bit and 32-bit instructions due to the
	// limited immediate size. Force 32-bit so we can do a computed branch.
.option push
.option norvc
	sb a1, 17(a0)
	sb a1, 16(a0)
	sb a1, 15(a0)
	sb a1, 14(a0)
	sb a1, 13(a0)
	sb a1, 12(a0)
	sb a1, 11(a0)
	sb a1, 10(a0)
	sb a1, 9(a0)
	sb a1, 8(a0)
	sb a1, 7(a0)
	sb a1, 6(a0)
	sb a1, 5(a0)
	sb a1, 4(a0)
	sb a1, 3(a0)
	sb a1, 2(a0)
	sb a1, 1(a0)
	sb a1, 0(a0)
.option pop
	ret
2:
	// Duplicate byte x4
	packh a1, a1, a1
	pack a1, a1, a1
	// Get a0 word-aligned. Each fix-up store is skipped when the relevant
	// address bit is already clear (i.e. no fix-up is needed at that size).
	andi a3, a0, 0x1
	beqz a3, 1f
	// a0 is odd: one byte store makes it halfword-aligned
	sb a1, (a0)
	addi a0, a0, 1
	addi a2, a2, -1
1:
	andi a3, a0, 0x2
	beqz a3, 1f
	// a0 is halfword- but not word-aligned: one halfword store fixes that
	sh a1, (a0)
	addi a0, a0, 2
	addi a2, a2, -2
1:
	// Set up for main loop. Limit pointer at end - (loop body size)
	add a2, a2, a0
	addi a2, a2, -16

	// Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
1:
	sw a1, 0(a0)
	sw a1, 4(a0)
	sw a1, 8(a0)
	sw a1, 12(a0)
	addi a0, a0, 16
	bgeu a2, a0, 1b

	// Main loop done, now tidy up the odds and ends. Note bits 3:0 of the
	// pointer difference are not affected by us subtracting 16 earlier.
	sub a2, a2, a0
	// No more than 15 bytes remaining -- first test bit 3 by shifting it to sign bit
	slli a2, a2, 28
	bgez a2, 1f
	sw a1, 0(a0)
	sw a1, 4(a0)
	addi a0, a0, 8
1:
	// Bit 2: four more bytes
	slli a2, a2, 1
	bgez a2, 1f
	sw a1, (a0)
	addi a0, a0, 4
1:
	// Bit 1: two more bytes
	slli a2, a2, 1
	bgez a2, 1f
	sh a1, (a0)
	addi a0, a0, 2
1:
	// Bit 0: one last byte
	slli a2, a2, 1
	bgez a2, 1f
	sb a1, (a0)
1:
	ret
|
||||
|
||||
.p2align 2
|
||||
decl_func sprite_fill16
	// a0: dst, a1: value, a2: count (halfword pixels)
	//
	// Short fills (count <= 16) use a computed branch into a slide of
	// halfword stores: entering 4 * count bytes before the end of the slide
	// executes exactly the last `count` stores. The li is forced to 32 bits
	// so the slide starts exactly 12 bytes after the auipc.
	norvc_2a li a3, 16
	bltu a3, a2, 2f
#ifndef RISCV_HAVE_COMPRESSED_ISA
#error "This address computation is wrong for non-RVC:"
#endif
	auipc a3, 0         // 32-bit instruction after address of auipc
	slli a2, a2, 2      // 16-bit instruction after address of auipc
	sub a3, a3, a2      // 16-bit instruction after address of auipc
	jr a3, 16 * 4 + 12  // 32-bit instruction after address of auipc
	// Force 32-bit encodings so every slide entry is 4 bytes
.option push
.option norvc
	sh a1, 30(a0)
	sh a1, 28(a0)
	sh a1, 26(a0)
	sh a1, 24(a0)
	sh a1, 22(a0)
	sh a1, 20(a0)
	sh a1, 18(a0)
	sh a1, 16(a0)
	sh a1, 14(a0)
	sh a1, 12(a0)
	sh a1, 10(a0)
	sh a1, 8(a0)
	sh a1, 6(a0)
	sh a1, 4(a0)
	sh a1, 2(a0)
	sh a1, 0(a0)
.option pop
	ret
2:
	// Get word-aligned before main fill loop. The test is on the destination
	// pointer (not the count): if bit 1 of a0 is set, write one pixel so
	// that the word stores below are naturally aligned.
	andi a3, a0, 0x2
	beqz a3, 1f
	sh a1, (a0)
	addi a0, a0, 2
	addi a2, a2, -1
1:
	// Set limit pointer at end - (loop body size)
	slli a2, a2, 1
	add a2, a2, a0
	addi a2, a2, -32
	// Duplicate the pixel into both halves of the word
	pack a1, a1, a1
	// We can fall through because cases < 1 loop are handled by slide
1:
	sw a1, 0(a0)
	sw a1, 4(a0)
	sw a1, 8(a0)
	sw a1, 12(a0)
	sw a1, 16(a0)
	sw a1, 20(a0)
	sw a1, 24(a0)
	sw a1, 28(a0)
	addi a0, a0, 32
	bgeu a2, a0, 1b

	// Most of the work done, we have a few more to tidy up -- note bits 4:1
	// of the pointer difference are not affected by earlier subtraction of 32
	sub a2, a2, a0

	// Bit 4 becomes sign bit
	slli a2, a2, 27
	bgez a2, 1f
	sw a1, 0(a0)
	sw a1, 4(a0)
	sw a1, 8(a0)
	sw a1, 12(a0)
	addi a0, a0, 16
1:
	// Bit 3: eight more bytes
	slli a2, a2, 1
	bgez a2, 1f
	sw a1, 0(a0)
	sw a1, 4(a0)
	addi a0, a0, 8
1:
	// Bit 2: four more bytes
	slli a2, a2, 1
	bgez a2, 1f
	sw a1, 0(a0)
	addi a0, a0, 4
1:
	// Bit 1: one last pixel
	slli a2, a2, 1
	bgez a2, 1f
	sh a1, 0(a0)
1:
	ret
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Non-AT sprite
|
||||
|
||||
|
||||
// TODO 8-bit version not yet ported to RISC-V
|
||||
#if 0
|
||||
// Unrolled loop body with an initial computed branch.
|
||||
|
||||
// a0: dst
|
||||
// a1: src
|
||||
// a2: pixel count
|
||||
decl_func sprite_blit8
|
||||
mov ip, a0
|
||||
lsrs a3, a2, #3
|
||||
lsls a3, #3
|
||||
eors a2, a3 // a2 = pixels % 8, a3 = pixels - pixels % 8
|
||||
|
||||
add a0, a3
|
||||
add a1, a3
|
||||
|
||||
adr a3, 2f
|
||||
lsls a2, #2
|
||||
subs a3, a2
|
||||
adds a3, #1 // thumb bit >:(
|
||||
bx a3
|
||||
|
||||
.align 2
|
||||
1:
|
||||
subs a0, #8
|
||||
subs a1, #8
|
||||
ldrb a3, [a1, #7]
|
||||
strb a3, [a0, #7]
|
||||
ldrb a3, [a1, #6]
|
||||
strb a3, [a0, #6]
|
||||
ldrb a3, [a1, #5]
|
||||
strb a3, [a0, #5]
|
||||
ldrb a3, [a1, #4]
|
||||
strb a3, [a0, #4]
|
||||
ldrb a3, [a1, #3]
|
||||
strb a3, [a0, #3]
|
||||
ldrb a3, [a1, #2]
|
||||
strb a3, [a0, #2]
|
||||
ldrb a3, [a1, #1]
|
||||
strb a3, [a0, #1]
|
||||
ldrb a3, [a1, #0]
|
||||
strb a3, [a0, #0]
|
||||
2:
|
||||
cmp a0, ip
|
||||
bhi 1b
|
||||
bx lr
|
||||
|
||||
.macro sprite_blit8_alpha_body n
|
||||
ldrb a3, [a1, #\n]
|
||||
lsrs a2, a3, #ALPHA_SHIFT_8BPP
|
||||
bcc 2f
|
||||
strb a3, [a0, #\n]
|
||||
2:
|
||||
.endm
|
||||
|
||||
// a0: dst
|
||||
// a1: src
|
||||
// a2: pixel count
|
||||
decl_func sprite_blit8_alpha
|
||||
mov ip, a0
|
||||
lsrs a3, a2, #3
|
||||
lsls a3, #3
|
||||
eors a2, a3
|
||||
|
||||
add a0, a3
|
||||
add a1, a3
|
||||
|
||||
adr a3, 3f
|
||||
lsls a2, #3
|
||||
subs a3, a2
|
||||
adds a3, #1
|
||||
bx a3
|
||||
|
||||
.align 2
|
||||
1:
|
||||
subs a0, #8
|
||||
subs a1, #8
|
||||
sprite_blit8_alpha_body 7
|
||||
sprite_blit8_alpha_body 6
|
||||
sprite_blit8_alpha_body 5
|
||||
sprite_blit8_alpha_body 4
|
||||
sprite_blit8_alpha_body 3
|
||||
sprite_blit8_alpha_body 2
|
||||
sprite_blit8_alpha_body 1
|
||||
sprite_blit8_alpha_body 0
|
||||
3:
|
||||
cmp a0, ip
|
||||
bhi 1b
|
||||
bx lr
|
||||
|
||||
#endif
|
||||
|
||||
// storew_alignh val, base, disp
//
// Store the 32-bit value held in \val to \disp(\base) as two halfword stores,
// low half first, so the destination need only be halfword-aligned.
// \val is shifted right by 16 along the way, i.e. it is clobbered.
//
// Paired with a word load this has the same ideal cycle count as
// lhu; lhu; sh; sh, but makes 25% fewer memory accesses, so there is less
// bus contention.
.macro storew_alignh val base disp
	sh   \val, \disp(\base)
	srli \val, \val, 16
	sh   \val, \disp+2(\base)
.endm
|
||||
|
||||
// sprite_blit16: copy a span of 16bpp pixels from src to dst, no alpha test.
//
// a0: dst
// a1: src
// a2: pixel count
//
// Clobbers a0-a3 and a5.
decl_func sprite_blit16
	// Nothing to do for an empty span. This check is required for
	// correctness, not just speed: with a zero count and a src that is not
	// word-aligned, the alignment fix-up below would copy one pixel and leave
	// a count of -1, and the tail code would then copy a further 7 pixels.
	beqz a2, .Lsprite_blit16_done

	// Force source pointer to be word-aligned, by copying one leading pixel
	// if bit 1 of the address is set.
	andi a3, a1, 2
	beqz a3, .Lsprite_blit16_src_aligned
	lhu a3, (a1)
	sh a3, (a0)
	addi a0, a0, 2
	addi a1, a1, 2
	addi a2, a2, -1
.Lsprite_blit16_src_aligned:
	// Each loop is 8 pixels. Place limit pointer (a5) at 16 bytes before
	// the end of dst, and loop until dst is past it. There will then be 0 to
	// 7 pixels remaining.
	slli a2, a2, 1
	add a2, a2, a0
	addi a5, a2, -16
	// Early out if there are fewer than 8 pixels in total:
	bltu a5, a0, .Lsprite_blit16_tail
.Lsprite_blit16_loop:
	// src is word-aligned so use word loads; dst may only be
	// halfword-aligned, so each word is stored as two halfwords.
	lw a2, 0(a1)
	lw a3, 4(a1)
	storew_alignh a2, a0, 0
	storew_alignh a3, a0, 4
	lw a2, 8(a1)
	lw a3, 12(a1)
	storew_alignh a2, a0, 8
	storew_alignh a3, a0, 12
	addi a0, a0, 16
	addi a1, a1, 16
	bgeu a5, a0, .Lsprite_blit16_loop
.Lsprite_blit16_tail:
	// a5 - a0 is (bytes remaining - 16), so its low four bits are the number
	// of bytes remaining (0 to 14). Test them one at a time by shifting each
	// into the sign bit.
	sub a5, a5, a0
	// At least 4 pixels? (bit 3 -> sign bit)
	slli a5, a5, 28
	bgez a5, .Lsprite_blit16_lt4
	lw a2, 0(a1)
	lw a3, 4(a1)
	storew_alignh a2, a0, 0
	storew_alignh a3, a0, 4
	addi a0, a0, 8
	addi a1, a1, 8
.Lsprite_blit16_lt4:
	// At least 2 pixels? (bit 2 -> sign bit)
	slli a5, a5, 1
	bgez a5, .Lsprite_blit16_lt2
	lw a2, 0(a1)
	storew_alignh a2, a0, 0
	addi a0, a0, 4
	addi a1, a1, 4
.Lsprite_blit16_lt2:
	// One more pixel? (bit 1 -> sign bit)
	slli a5, a5, 1
	bgez a5, .Lsprite_blit16_done
	lhu a3, (a1)
	sh a3, (a0)
.Lsprite_blit16_done:
	ret
|
||||
|
||||
// sprite_blit16_alpha_body_x2 pair
//
// Copy the two 16bpp pixels at byte offsets 4*\pair and 4*\pair+2 from src
// (a1) to dst (a0). A pixel is only stored if bit ALPHA_SHIFT_16BPP - 1 of
// its value (the alpha bit) is set.
//
// Clobbers a4-a7.
.macro sprite_blit16_alpha_body_x2 pair
	// RVC is switched off for the whole body so that every instruction is
	// 32 bits and the branch targets stay 32-bit-aligned without needing
	// alignment nops (lhu/sh *may* otherwise be 16-bit if Zcb is enabled).
	.option push
	.option norvc
	// Issue both loads before either shift, to avoid a load->shift
	// dependency stall.
	lhu  a4, 4*\pair(a1)
	lhu  a5, 4*\pair+2(a1)
	// Move each pixel's alpha bit into the sign bit
	slli a6, a4, 32 - ALPHA_SHIFT_16BPP
	slli a7, a5, 32 - ALPHA_SHIFT_16BPP
	// First pixel: skip the store if transparent
	bgez a6, 3f
	sh   a4, 4*\pair(a0)
3:
	// Second pixel: likewise
	bgez a7, 3f
	sh   a5, 4*\pair+2(a0)
3:
	.option pop
.endm
|
||||
|
||||
// sprite_blit16_alpha: copy a span of 16bpp pixels from src to dst, skipping
// pixels whose alpha bit is clear.
//
// a0: dst
// a1: src
// a2: pixel count
//
// Clobbers a0-a2 and a4-a7.
decl_func sprite_blit16_alpha
	// The v6-M code uses a computed branch into the unrolled loop. That is
	// not done here because it doesn't play nicely with the pairing of pixels
	// used in the loop body.

	// a2 = limit pointer, 16 bytes (8 pixels) before the end of dst
	slli a2, a2, 1
	add a2, a2, a0
	norvc_3a addi, a2, a2, -16
	// Fewer than 8 pixels in total: go straight to the tail
	bltu a2, a0, .Lsprite_blit16_alpha_tail
.Lsprite_blit16_alpha_loop:
	// 8 pixels per loop
	sprite_blit16_alpha_body_x2 0
	sprite_blit16_alpha_body_x2 1
	sprite_blit16_alpha_body_x2 2
	sprite_blit16_alpha_body_x2 3
	addi a0, a0, 16
	addi a1, a1, 16
	bgeu a2, a0, .Lsprite_blit16_alpha_loop
.Lsprite_blit16_alpha_tail:
	// a2 - a0 is (bytes remaining - 16), so its low four bits are the number
	// of bytes remaining. Each bit is shifted into the sign bit in turn.
	sub a2, a2, a0
	// At least 4 pixels? (bit 3 -> sign bit)
	slli a2, a2, 28
	bgez a2, .Lsprite_blit16_alpha_lt4
	sprite_blit16_alpha_body_x2 0
	sprite_blit16_alpha_body_x2 1
	addi a0, a0, 8
	addi a1, a1, 8
.Lsprite_blit16_alpha_lt4:
	// At least 2 pixels?
	norvc_3a slli, a2, a2, 1
	bgez a2, .Lsprite_blit16_alpha_lt2
	sprite_blit16_alpha_body_x2 0
	addi a1, a1, 4
	addi a0, a0, 4
.Lsprite_blit16_alpha_lt2:
	// One more pixel?
	slli a2, a2, 1
	bgez a2, .Lsprite_blit16_alpha_done
	lhu a4, (a1)
	// Alpha bit -> sign bit; store only if it is set
	slli a6, a4, 32 - ALPHA_SHIFT_16BPP
	bgez a6, .Lsprite_blit16_alpha_done
	sh a4, (a0)
.Lsprite_blit16_alpha_done:
	ret
|
||||
// ----------------------------------------------------------------------------
|
||||
// Affine-transformed sprite (note these are just the inner loops -- INTERP0
|
||||
// must be configured by the caller, which is presumably not written in asm)
|
||||
|
||||
// TODO not yet ported to RISC-V
|
||||
#if 0
|
||||
// r0: raster start pointer
|
||||
// r1: raster span size (pixels)
|
||||
|
||||
.macro sprite_ablit8_loop_body n
|
||||
ldr r1, [r3, #CTRL0_OFFS]
|
||||
ldr r2, [r3, #POP2_OFFS]
|
||||
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
|
||||
bcs 2f
|
||||
ldrb r2, [r2]
|
||||
strb r2, [r0, #\n]
|
||||
2:
|
||||
.endm
|
||||
|
||||
decl_func sprite_ablit8_loop
|
||||
mov ip, r0
|
||||
|
||||
lsrs r2, r1, #3
|
||||
lsls r2, #3
|
||||
eors r1, r2
|
||||
add r0, r2
|
||||
|
||||
adr r2, 3f
|
||||
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
|
||||
muls r1, r3
|
||||
subs r2, r1
|
||||
adds r2, #1
|
||||
|
||||
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
bx r2
|
||||
|
||||
.align 2
|
||||
nop
|
||||
1:
|
||||
subs r0, #8
|
||||
sprite_ablit8_loop_body 7
|
||||
sprite_ablit8_loop_body 6
|
||||
sprite_ablit8_loop_body 5
|
||||
sprite_ablit8_loop_body 4
|
||||
sprite_ablit8_loop_body 3
|
||||
sprite_ablit8_loop_body 2
|
||||
sprite_ablit8_loop_body 1
|
||||
sprite_ablit8_loop_body 0
|
||||
3:
|
||||
cmp r0, ip
|
||||
bne 1b
|
||||
bx lr
|
||||
|
||||
|
||||
|
||||
// As above but bit 5 is assumed to be an alpha bit (RAGB2132)
|
||||
|
||||
.macro sprite_ablit8_alpha_loop_body n
|
||||
ldr r1, [r3, #CTRL0_OFFS]
|
||||
ldr r2, [r3, #POP2_OFFS]
|
||||
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
|
||||
bcs 2f
|
||||
ldrb r2, [r2]
|
||||
lsrs r1, r2, #ALPHA_SHIFT_8BPP
|
||||
bcc 2f
|
||||
strb r2, [r0, #\n]
|
||||
2:
|
||||
.endm
|
||||
|
||||
decl_func sprite_ablit8_alpha_loop
|
||||
mov ip, r0
|
||||
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
|
||||
lsrs r2, r1, #3
|
||||
lsls r2, #3
|
||||
eors r1, r2
|
||||
add r0, r2
|
||||
|
||||
adr r2, 3f
|
||||
lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
|
||||
subs r2, r1
|
||||
adds r2, #1
|
||||
bx r2
|
||||
|
||||
.align 2
|
||||
nop
|
||||
1:
|
||||
subs r0, #8
|
||||
sprite_ablit8_alpha_loop_body 7
|
||||
sprite_ablit8_alpha_loop_body 6
|
||||
sprite_ablit8_alpha_loop_body 5
|
||||
sprite_ablit8_alpha_loop_body 4
|
||||
sprite_ablit8_alpha_loop_body 3
|
||||
sprite_ablit8_alpha_loop_body 2
|
||||
sprite_ablit8_alpha_loop_body 1
|
||||
sprite_ablit8_alpha_loop_body 0
|
||||
3:
|
||||
cmp r0, ip
|
||||
bhi 1b
|
||||
bx lr
|
||||
|
||||
|
||||
|
||||
.macro sprite_ablit16_loop_body n
|
||||
ldr r1, [r3, #CTRL0_OFFS]
|
||||
ldr r2, [r3, #POP2_OFFS]
|
||||
lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
|
||||
bcs 2f
|
||||
ldrh r2, [r2]
|
||||
strh r2, [r0, #2*\n]
|
||||
2:
|
||||
.endm
|
||||
|
||||
decl_func sprite_ablit16_loop
|
||||
mov ip, r0
|
||||
|
||||
lsrs r2, r1, #3
|
||||
lsls r2, #3
|
||||
eors r1, r2
|
||||
lsls r2, #1 // Each pixel is 2 bytes
|
||||
add r0, r2
|
||||
|
||||
adr r2, 3f
|
||||
movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
|
||||
muls r1, r3
|
||||
subs r2, r1
|
||||
adds r2, #1
|
||||
|
||||
ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
|
||||
bx r2
|
||||
|
||||
.align 2
|
||||
nop
|
||||
1:
|
||||
subs r0, #16
|
||||
sprite_ablit16_loop_body 7
|
||||
sprite_ablit16_loop_body 6
|
||||
sprite_ablit16_loop_body 5
|
||||
sprite_ablit16_loop_body 4
|
||||
sprite_ablit16_loop_body 3
|
||||
sprite_ablit16_loop_body 2
|
||||
sprite_ablit16_loop_body 1
|
||||
sprite_ablit16_loop_body 0
|
||||
3:
|
||||
cmp r0, ip
|
||||
bne 1b
|
||||
bx lr
|
||||
|
||||
#endif
|
||||
|
||||
#define FIX_OVERF_CHECK 1
|
||||
|
||||
#ifndef RISCV_HAVE_COMPRESSED_ISA
|
||||
#error "Address calculations are incorrect if not assembled with C extension"
|
||||
#endif
|
||||
// sprite_ablit16_alpha_loop_body idx
//
// One pixel of the affine-transformed, alpha-tested 16bpp blit. Pops a source
// pixel pointer from the interpolator at a5 and, unless the pixel is rejected,
// stores the pixel to 2*\idx(a0).
//
// Clobbers a1 and a2 (and a3 when FIX_OVERF_CHECK is set).
//
// The size of this macro is load-bearing: sprite_ablit16_alpha_loop branches
// into the middle of a run of these bodies using a computed offset.
// Instructions which are only compressible under Zcb (e.g. lhu, sh) are
// forced uncompressed, to get a consistent size for that calculation. The
// body must assemble to exactly 24 bytes when FIX_OVERF_CHECK is 0, and
// exactly 30 bytes when it is 1; these are the multipliers used by
// sprite_ablit16_alpha_loop.
.macro sprite_ablit16_alpha_loop_body idx
#if !FIX_OVERF_CHECK
	// Bit 25 is OVERF, bit 24 is OVERF1, bits 31:26 are zero, so can test for
	// overflow by testing the uppermost byte of CTRL0 for nonzero.
	norvc_2a lbu a1, CTRL0_OFFS+3(a5)
	lw a2, POP2_OFFS(a5)
	bnez a1, 2f
#else
	// Read both accumulators, then pop. The pixel is skipped if either
	// accumulator has any of bits 31:23 set.
	// NOTE(review): presumably the accumulators are 16.16 fixed-point texture
	// coordinates with a 7-bit integer range -- confirm against the caller's
	// interpolator setup.
	lw a1, ACCUM0_OFFS(a5)
	lw a3, ACCUM1_OFFS(a5)
	lw a2, POP2_OFFS(a5)
	srli a1, a1, 7 + 16
	bnez a1, 2f
	srli a3, a3, 7 + 16
	bnez a3, 2f
#endif
	// Fetch the source pixel through the popped pointer
	norvc_2a lhu a2, (a2)
	// TODO dep stall on lhu, but it makes the OVERF case faster:
	// Alpha bit -> sign bit; skip the store if it is clear
	slli a1, a2, 32 - ALPHA_SHIFT_16BPP
	bgez a1, 2f
	norvc_2a sh a2, 2*\idx(a0)
2:
.endm
|
||||
|
||||
// sprite_ablit16_alpha_loop: inner loop of the affine-transformed 16bpp blit
// with alpha test. INTERP0 must already have been configured by the caller.
//
// a0: dst (raster start pointer)
// a1: span size in pixels
//
// Clobbers a0-a5.
decl_func sprite_ablit16_alpha_loop
	// a4 = start of dst, used as the loop termination bound
	mv a4, a0
	// a5 = interpolator register base used by the loop body
	li a5, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET

	// Split off pixels modulo 8: a2 = count % 8, a1 = count - count % 8
	andi a2, a1, 0x7
	sub a1, a1, a2
	// Pointer to beginning of endmost block of 8 pixels:
	sh1add a0, a1, a0

	// Compute a branch into the first pass of the loop, which handles the
	// modulo-8 pixels by entering that many loop bodies before the end.
	// Each pixel takes 24 bytes of instructions without FIX_OVERF_CHECK, or
	// 30 bytes with it.
#if !FIX_OVERF_CHECK
	// a2 *= 24, as (a2 * 8) * 3
	slli a2, a2, 3
	sh1add a2, a2, a2
#else
	// a2 *= 30
	li a3, 30
	mul a2, a2, a3
#endif

	la a1, .Lsprite_ablit16_alpha_loop_test
	sub a1, a1, a2
	jr a1

.align 2
.Lsprite_ablit16_alpha_loop_block:
	// Step back to the previous block of 8 pixels (16 bytes)
	norvc_3a addi a0, a0, -16
	sprite_ablit16_alpha_loop_body 7
	sprite_ablit16_alpha_loop_body 6
	sprite_ablit16_alpha_loop_body 5
	sprite_ablit16_alpha_loop_body 4
	sprite_ablit16_alpha_loop_body 3
	sprite_ablit16_alpha_loop_body 2
	sprite_ablit16_alpha_loop_body 1
	sprite_ablit16_alpha_loop_body 0
.Lsprite_ablit16_alpha_loop_test:
	// Keep going while the current block pointer is above the start of dst
	bltu a4, a0, .Lsprite_ablit16_alpha_loop_block
	ret
|
|
@ -1,6 +1,6 @@
|
|||
#include "tile.h"
|
||||
|
||||
#include "pico/platform.h" // for __not_in_flash
|
||||
#include "pico.h" // for __not_in_flash
|
||||
#include "hardware/interp.h"
|
||||
|
||||
#define __ram_func(foo) __not_in_flash(#foo) foo
|
||||
|
|
|
@ -0,0 +1,188 @@
|
|||
#include "hardware/regs/addressmap.h"
|
||||
#include "hardware/regs/sio.h"
|
||||
|
||||
#include "sprite_asm_const.h"
|
||||
|
||||
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Tile layout
|
||||
//
|
||||
// Some terms:
|
||||
// Tileset: 1D array of tile images, concatenated image-after-image
|
||||
// Tilemap: 2D array of tileset indices
|
||||
//
|
||||
// Each tile image in a tileset is the same size. Tiles are square, either 8 x
|
||||
// 8 px or 16 x 16 px. This makes it easy to find the start of a tile image
|
||||
// given the tileset base pointer and a tile index (add + shift).
|
||||
//
|
||||
// Tilemaps are 8 bits per tile, always.
|
||||
//
|
||||
// One advantage of this layout is that y coordinates can be handled outside
|
||||
// of the loops in this file, which are all scanline-oriented, by offsetting
|
||||
// the tileset and tilemap pointers passed in. These routines only care about
|
||||
// x. The tileset pointer is offset by y modulo tile height, and the tilemap
|
||||
// pointer is offset by y divided by tile height, modulo tilemap height in
|
||||
// tiles.
|
||||
|
||||
// Tileset: 16px tiles, 16bpp, with 1-bit alpha.
|
||||
// Tilemap: 8 bit indices.
|
||||
|
||||
// do_2px_16bpp_alpha dst, px, tmp, offs
//
// \px holds two 16bpp pixels (first pixel in the low half). Store the first
// to \offs(\dst) and the second to \offs+2(\dst), each only if its alpha bit
// (bit ALPHA_SHIFT_16BPP - 1 of the pixel) is set.
//
// Clobbers \tmp. \px is shifted right by 16 if the second pixel is stored.
.macro do_2px_16bpp_alpha dst px tmp offs
	.option push
	.option norvc
	// TODO we could save a shift here by making alpha the MSB (not worth it
	// on Arm due to lack of sign-extension or flag update on loads)

	// First pixel: alpha bit -> sign bit, skip the store if clear
	slli \tmp, \px, 32 - ALPHA_SHIFT_16BPP
	bgez \tmp, 1f
	sh   \px, \offs(\dst)
1:
	// Second pixel: same test on the upper halfword
	slli \tmp, \px, 16 - ALPHA_SHIFT_16BPP
	bgez \tmp, 1f
	srli \px, \px, 16
	sh   \px, \offs+2(\dst)
1:
	.option pop
.endm
|
||||
|
||||
// do_2px_16bpp dst, px, offs
//
// \px holds two 16bpp pixels (first pixel in the low half). Store them
// unconditionally to \offs(\dst) and \offs+2(\dst) as two halfword stores.
//
// Clobbers \px (it is shifted right by 16).
.macro do_2px_16bpp dst px offs
	sh   \px, \offs(\dst)
	srli \px, \px, 16
	sh   \px, \offs+2(\dst)
.endm
|
||||
|
||||
// Render one scanline span of a tilemap with 16 x 16 px, 16bpp tiles.
//
// interp1 has been set up to give the next x-ward pointer into the tilemap
// with each pop. This saves us having to remember the tilemap pointer and
// tilemap x size mask in core registers.
//
// a0: dst
// a1: tileset
// a2: x0 (start pos in tile space)
// a3: x1 (end pos in tile space, exclusive)
//
// Clobbers a0-a7, t0 and t1.
//
// Instantiated with alpha=1 and alpha=0 to get both variants of the loop.
// With alpha=1, a pixel is only written if bit ALPHA_SHIFT_16BPP - 1 of its
// value is set. Linker garbage collection ensures we only keep the versions
// we use.

.macro tile16_16px_loop_alpha_or_nonalpha alpha
	// Empty span: return without touching dst or popping interp1. This is
	// needed because the leading-pixel loop below always copies at least one
	// pixel before checking for the end of the span, so an empty span with an
	// unaligned x0 would otherwise write one pixel.
	bgeu a2, a3, 9f

	li a7, SIO_BASE + SIO_INTERP1_ACCUM0_OFFSET

	// The main loop only handles whole tiles, so we may need to first copy
	// individual pixels to get tile-aligned. Skip this entirely if we are
	// already aligned, to avoid the extra interp pop.
	andi a5, a2, 0xf
	beqz a5, 3f

	// Get pointer to tileset image
	lw a4, POP2_OFFS(a7)
	lbu a4, (a4) // dep stall
	slli a4, a4, 9 // 16 px wide * 16 px high * 2 bytes/px
	add a4, a4, a1
	// Offset tile image pointer to align with x0
	sh1add a4, a5, a4
	// Fall through into copy loop
1:
	lhu a5, (a4)
	addi a4, a4, 2 // hoisted to fill load dependency slot
.if \alpha
	slli a6, a5, 32 - ALPHA_SHIFT_16BPP
	bgez a6, 2f
.endif
	sh a5, (a0)
2:
	addi a0, a0, 2
	addi a2, a2, 1
	// Skip out if we have already reached end of span:
	bgeu a2, a3, 3f
	// Loop if we are not yet aligned: (TODO these checks could be merged)
	andi a6, a2, 0xf
	bnez a6, 1b
3:
	// The next output pixel is aligned to the start of a tile. Set up main loop.

	// Tileset pointer is only needed occasionally, so free up a1 for better
	// code density:
	mv t0, a1
	// t1: dst limit pointer at end of all pixels:
	sub a3, a3, a2
	sh1add t1, a3, a0
	// a5: dst limit pointer at end of whole tiles:
	andi a4, a3, ~0xf
	sh1add a5, a4, a0

	// a0 is dst, a7 is interp base, a1-a4 are trashed by loop, a5 is dst limit.
	// Early skip for case of 0 whole tiles:
	bgeu a0, a5, 3f
2:
	// Get next tilemap pointer
	lw a1, POP2_OFFS(a7)
	// Get tile image pointer
	lbu a1, (a1) // dep stall
	slli a1, a1, 9
	add a1, a1, t0

	// Copy one 16-pixel (32-byte) row of the tile, two words at a time.
	// Stores are halfword-sized, so dst need not be word-aligned.
.if \alpha
	lw a3, 0(a1)
	lw a4, 4(a1)
	do_2px_16bpp_alpha a0 a3 a2 0
	do_2px_16bpp_alpha a0 a4 a2 4
	lw a3, 8(a1)
	lw a4, 12(a1)
	do_2px_16bpp_alpha a0 a3 a2 8
	do_2px_16bpp_alpha a0 a4 a2 12
	lw a3, 16(a1)
	lw a4, 20(a1)
	do_2px_16bpp_alpha a0 a3 a2 16
	do_2px_16bpp_alpha a0 a4 a2 20
	lw a3, 24(a1)
	lw a4, 28(a1)
	do_2px_16bpp_alpha a0 a3 a2 24
	do_2px_16bpp_alpha a0 a4 a2 28
.else
	lw a3, 0(a1)
	lw a4, 4(a1)
	do_2px_16bpp a0 a3 0
	do_2px_16bpp a0 a4 4
	lw a3, 8(a1)
	lw a4, 12(a1)
	do_2px_16bpp a0 a3 8
	do_2px_16bpp a0 a4 12
	lw a3, 16(a1)
	lw a4, 20(a1)
	do_2px_16bpp a0 a3 16
	do_2px_16bpp a0 a4 20
	lw a3, 24(a1)
	lw a4, 28(a1)
	do_2px_16bpp a0 a3 24
	do_2px_16bpp a0 a4 28
.endif
	addi a0, a0, 32
	bltu a0, a5, 2b
3:

	// Skip ahead if there are no spare pixels to tidy up
	bgeu a0, t1, 3f
	// Copy <1 tile's worth of loose pixels
	lw a4, POP2_OFFS(a7)
	lbu a4, (a4) // dep stall
	slli a4, a4, 9
	add a4, a4, t0
1:
	// lhu rather than lh, to match the leading-pixel loop. Only the low 16
	// bits of a5 are stored or tested, so the result is the same either way.
	lhu a5, (a4)
	addi a4, a4, 2
.if \alpha
	slli a6, a5, 32 - ALPHA_SHIFT_16BPP
	bgez a6, 2f
.endif
	sh a5, (a0)
2:
	addi a0, a0, 2
	bltu a0, t1, 1b
3:
9:
	ret
.endm

decl_func tile16_16px_alpha_loop
	tile16_16px_loop_alpha_or_nonalpha 1

decl_func tile16_16px_loop
	tile16_16px_loop_alpha_or_nonalpha 0
|
Ładowanie…
Reference in New Issue