diff --git a/Readme.md b/Readme.md index 7653282..9934023 100644 --- a/Readme.md +++ b/Readme.md @@ -1,3 +1,30 @@ +RP2350 PicoDVI Preview +====================== + +Changes from the public GitHub version: + +* All Arm assembly in `libdvi` has been ported to RISC-V and tuned for Hazard3 +* Some of the existing Arm assembly in `libdvi` has been tweaked for better performance on Cortex-M33 +* RGB encode now uses the SIO TMDS encoders by default on RP2350 (can be disabled by defining `DVI_USE_SIO_TMDS_ENCODER=0` -- see `software/libdvi/dvi_config_defs.h`) +* Much of the Arm assembly in `libsprite` has been ported to RISC-V -- enough to run the stock demos + +Build instructions: + +```bash +cd software +mkdir build && cd build +# PICO_PLATFORM can also be rp2350-riscv +# List of DVI configs is in software/include/common_dvi_pin_configs.h +cmake -DPICO_SDK_PATH=/path/to/sdk -DPICO_PLATFORM=rp2350 -DPICO_COPY_TO_RAM=1 -DDVI_DEFAULT_SERIAL_CONFIG=pico_sock_cfg .. +make -j$(nproc) +# Then flash a binary, e.g.: +cp apps/tiles_and_sprites/tiles_and_sprites.uf2 /path/to/mounted/drive/ +``` + +If you plan to run the `vista` demo, then note that there are now two UF2 data files, `software/assets/vista_data_rp2040.uf2` and `software/assets/vista_data_rp2350.uf2`. The only difference is the family IDs: the first can be dragged on RP2040 and on RP2350 A0, and the second can be dragged on RP2350 A1 and later. 
+ +The following is the original RP2040 writeup: + Bitbanged DVI on the RP2040 Microcontroller =========================================== diff --git a/software/.gitignore b/software/.gitignore index 89efb0f..2e12106 100644 --- a/software/.gitignore +++ b/software/.gitignore @@ -1,2 +1,3 @@ build *.swp +build-* diff --git a/software/apps/CMakeLists.txt b/software/apps/CMakeLists.txt index 0c95dfb..233ff73 100644 --- a/software/apps/CMakeLists.txt +++ b/software/apps/CMakeLists.txt @@ -1,4 +1,7 @@ -add_subdirectory(bad_apple) +if (NOT PICO_RISCV) + # Arm assembly needs porting to RISC-V + add_subdirectory(bad_apple) +endif() add_subdirectory(colour_terminal) add_subdirectory(christmas_snowflakes) add_subdirectory(dht_logging) @@ -12,5 +15,8 @@ add_subdirectory(tiles) add_subdirectory(tiles_and_sprites) add_subdirectory(tiles_parallax) add_subdirectory(vista) -add_subdirectory(vista-palette) +if (PICO_RP2040) + # Needs porting to use XIP stream instead of SSI, as was done to vista + add_subdirectory(vista-palette) +endif() add_subdirectory(mandel-full) diff --git a/software/apps/colour_terminal/main.c b/software/apps/colour_terminal/main.c index 711837f..1b1a2a5 100644 --- a/software/apps/colour_terminal/main.c +++ b/software/apps/colour_terminal/main.c @@ -8,7 +8,6 @@ #include "hardware/gpio.h" #include "hardware/vreg.h" #include "hardware/structs/bus_ctrl.h" -#include "hardware/structs/ssi.h" #include "hardware/dma.h" #include "pico/sem.h" diff --git a/software/apps/colour_terminal/tmds_encode_font_2bpp.S b/software/apps/colour_terminal/tmds_encode_font_2bpp.S index a544890..0d4688d 100644 --- a/software/apps/colour_terminal/tmds_encode_font_2bpp.S +++ b/software/apps/colour_terminal/tmds_encode_font_2bpp.S @@ -1,9 +1,11 @@ #include "hardware/regs/addressmap.h" #include "hardware/regs/sio.h" +#ifndef __riscv .syntax unified .cpu cortex-m0plus .thumb +#endif // Using the following: // @@ -46,12 +48,13 @@ // r8 contains a pointer to the font bitmap for this scanline. 
// r9 contains the TMDS LUT base. .macro do_char charbuf_offs colour_shift_instr colour_shamt +#ifndef __riscv // Get 8x font bits for next character, put 4 LSBs in bits 6:3 of r4 (so // scaled to 8-byte LUT entries), and 4 MSBs in bits 6:3 of r6. - ldrb r4, [r0, #\charbuf_offs] // 2 - add r4, r8 // 1 - ldrb r4, [r4] // 2 - lsrs r6, r4, #4 // 1 + ldrb r4, [r0, #\charbuf_offs] // 2 (note these cycle + add r4, r8 // 1 counts are for M0+ + ldrb r4, [r4] // 2 and are a little + lsrs r6, r4, #4 // 1 pessimistic on M33) lsls r6, #3 // 1 lsls r4, #28 // 1 lsrs r4, #25 // 1 @@ -67,6 +70,31 @@ ldmia r4, {r4, r5} // 3 ldmia r6, {r6, r7} // 3 stmia r2!, {r4-r7} // 5 +#else + lbu a4, \charbuf_offs(a0) // 1 + \colour_shift_instr a5, a1, \colour_shamt // 1 + add a4, a4, t1 // 1 + lbu a4, (a4) // 2 + srli a6, a4, 4 // 1 + andi a4, a4, 0xf // 1 + + // Get colour bits, add to TMDS LUT base and font bits + and a5, a5, a3 // 1 + add a5, a5, t2 // 1 + sh3add a4, a4, a5 // 1 + sh3add a6, a6, a5 // 1 + + // Look up and write out 8 TMDS symbols + lw a5, 4(a4) // 1 + lw a4, 0(a4) // 1 + lw a7, 4(a6) // 1 + lw a6, 0(a6) // 1 + sw a4, 0(a2) // 1 + sw a5, 4(a2) // 1 + sw a6, 8(a2) // 1 + sw a7, 12(a2) // 1 + addi a2, a2, 16 // 1 +#endif .endm @@ -78,9 +106,12 @@ .section .scratch_x.tmds_encode_font_2bpp, "ax" .global tmds_encode_font_2bpp +#ifndef __riscv .type tmds_encode_font_2bpp,%function .thumb_func +#endif tmds_encode_font_2bpp: +#ifndef __riscv push {r4-r7, lr} mov r4, r8 mov r5, r9 @@ -123,6 +154,32 @@ tmds_encode_font_2bpp: mov r10, r6 pop {r4-r7, pc} +#else + + sh1add t0, a3, a2 + li a3, 0xf0 * 8 + + mv t1, a4 + la t2, palettised_1bpp_tables + mv t3, a1 + + bgeu a2, t0, 2f +1: + lw a1, (t3) + addi t3, t3, 4 + do_char 0 slli 7 + do_char 1 slli 3 + do_char 2 srli 1 + do_char 3 srli 5 + do_char 4 srli 9 + do_char 5 srli 13 + do_char 6 srli 17 + do_char 7 srli 21 + addi a0, a0, 8 + bltu a2, t0, 1b +2: + ret +#endif // Table generation: // levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa] 
diff --git a/software/apps/mandel-full/main.c b/software/apps/mandel-full/main.c index 5b492b5..15b991c 100644 --- a/software/apps/mandel-full/main.c +++ b/software/apps/mandel-full/main.c @@ -8,7 +8,6 @@ #include "hardware/pll.h" #include "hardware/sync.h" #include "hardware/structs/bus_ctrl.h" -#include "hardware/structs/ssi.h" #include "hardware/vreg.h" #include "pico/multicore.h" #include "pico/sem.h" diff --git a/software/apps/terminal/main.c b/software/apps/terminal/main.c index 7fed0c6..d868419 100644 --- a/software/apps/terminal/main.c +++ b/software/apps/terminal/main.c @@ -8,7 +8,6 @@ #include "hardware/gpio.h" #include "hardware/vreg.h" #include "hardware/structs/bus_ctrl.h" -#include "hardware/structs/ssi.h" #include "hardware/dma.h" #include "pico/sem.h" diff --git a/software/apps/tiles_parallax/main.c b/software/apps/tiles_parallax/main.c index 48a5e31..e5975d8 100644 --- a/software/apps/tiles_parallax/main.c +++ b/software/apps/tiles_parallax/main.c @@ -127,8 +127,10 @@ void __not_in_flash("render") render_loop() { tile16(pixbuf, &bg1, y, FRAME_WIDTH); queue_add_blocking(&dvi0.q_colour_valid, &pixbuf); } - bg0.xscroll += 1; - bg1.xscroll += 2; + bg1.xscroll += 1; + if (frame_ctr & 1) { + bg0.xscroll += 1; + } ++frame_ctr; } } diff --git a/software/apps/vista-palette/CMakeLists.txt b/software/apps/vista-palette/CMakeLists.txt index 9bd204a..e797ff7 100644 --- a/software/apps/vista-palette/CMakeLists.txt +++ b/software/apps/vista-palette/CMakeLists.txt @@ -10,9 +10,19 @@ add_executable(vista-palette # flash using direct SSI DMA, which would trample on XIP. 
pico_set_binary_type(vista-palette copy_to_ram) -pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2_common/boot_stage2/boot2_w25q080.S) -target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4) -pico_set_boot_stage2(vista-palette vista-palette_boot2) + +if (PICO_RP2040) + pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S) + pico_set_boot_stage2(vista-palette vista-palette_boot2) + target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4) +else () + target_compile_definitions(vista-palette PRIVATE + PICO_EMBED_XIP_SETUP=1 + PICO_BOOT_STAGE2_CHOOSE_W25Q080=1 + PICO_FLASH_SPI_CLKDIV=2 + PICO_FLASH_SPI_RXDELAY=3 + ) +endif() target_compile_definitions(vista-palette PRIVATE DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG} diff --git a/software/apps/vista/CMakeLists.txt b/software/apps/vista/CMakeLists.txt index c9f0103..3cb7ccb 100644 --- a/software/apps/vista/CMakeLists.txt +++ b/software/apps/vista/CMakeLists.txt @@ -13,6 +13,19 @@ target_compile_definitions(vista PRIVATE DVI_SYMBOLS_PER_WORD=1 ) +if (PICO_RP2040) + pico_define_boot_stage2(vista_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S) + pico_set_boot_stage2(vista vista_boot2) + target_compile_definitions(vista_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4) +else () + target_compile_definitions(vista PRIVATE + PICO_EMBED_XIP_SETUP=1 + PICO_BOOT_STAGE2_CHOOSE_W25Q080=1 + PICO_FLASH_SPI_CLKDIV=2 + PICO_FLASH_SPI_RXDELAY=3 + ) +endif() + target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200) target_link_libraries(vista diff --git a/software/apps/vista/main.c b/software/apps/vista/main.c index d8cff3a..7442dc7 100644 --- a/software/apps/vista/main.c +++ b/software/apps/vista/main.c @@ -7,11 +7,17 @@ #include "hardware/pll.h" #include "hardware/sync.h" #include "hardware/structs/bus_ctrl.h" -#include "hardware/structs/ssi.h" #include "hardware/vreg.h" #include "pico/multicore.h" 
#include "pico/sem.h" #include "pico/stdlib.h" +#if PICO_RP2040 +#include "hardware/structs/ssi.h" +#else +#include "hardware/structs/xip_ctrl.h" +#include "hardware/structs/xip_aux.h" +#include "hardware/structs/qmi.h" +#endif #include "tmds_encode.h" @@ -45,27 +51,40 @@ static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11); } -void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan) -{ +void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan) { +#if PICO_RP2040 + // On RP2040, program the SSI to clock the correct amount of data without stopping ssi_hw->ssienr = 0; ssi_hw->ctrlr1 = len - 1; // NDF, number of data frames ssi_hw->dmacr = SSI_DMACR_TDMAE_BITS | SSI_DMACR_RDMAE_BITS; ssi_hw->ssienr = 1; // Other than NDF, the SSI configuration used for XIP is suitable for a bulk read too. 
- - dma_hw->ch[dma_chan].read_addr = (uint32_t)&ssi_hw->dr0; + const uintptr_t read_addr = (uintptr_t)&ssi_hw->dr0; + const uint dreq = DREQ_XIP_SSIRX; + const bool bswap = true; +#else + // On RP2350, SSI is gone, but XIP streaming is fast enough to keep up with this demo + // (you can still DMA to the DIRECT_MODE FIFOs if you really need 100%) + xip_ctrl_hw->stream_addr = flash_offs; + xip_ctrl_hw->stream_ctr = len; + const uintptr_t read_addr = (uintptr_t)&xip_aux_hw->stream; + const uint dreq = DREQ_XIP_STREAM; + const bool bswap = false; +#endif + dma_hw->ch[dma_chan].read_addr = read_addr; dma_hw->ch[dma_chan].write_addr = (uint32_t)rxbuf; dma_hw->ch[dma_chan].transfer_count = len; dma_hw->ch[dma_chan].ctrl_trig = - DMA_CH0_CTRL_TRIG_BSWAP_BITS | - DREQ_XIP_SSIRX << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB | + (uint)bswap << DMA_CH0_CTRL_TRIG_BSWAP_LSB | + dreq << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB | dma_chan << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB | DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS | DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB | DMA_CH0_CTRL_TRIG_EN_BITS; - +#if PICO_RP2040 // Now DMA is waiting, kick off the SSI transfer (mode continuation bits in LSBs) ssi_hw->dr0 = (flash_offs << 8) | 0xa0; +#endif } // Core 1 handles DMA IRQs and runs TMDS encode on scanline buffers it @@ -91,6 +110,15 @@ int __not_in_flash("main") main() { sleep_ms(10); set_sys_clock_khz(DVI_TIMING.bit_clk_khz, true); + // A0 SDK won't pick up on the PICO_EMBED_XIP_SETUP flag, so just to make sure: +#if PICO_RP2350 + hw_write_masked( + &qmi_hw->m[0].timing, + 3 << QMI_M0_TIMING_RXDELAY_LSB | 2 << QMI_M0_TIMING_CLKDIV_LSB, + QMI_M0_TIMING_RXDELAY_BITS | QMI_M0_TIMING_CLKDIV_BITS + ); +#endif + setup_default_uart(); gpio_init(LED_PIN); @@ -131,21 +159,25 @@ int __not_in_flash("main") main() { } for (int y = 0; y < 2 * FRAME_HEIGHT; y += 2) { // Start DMA to back buffer before starting to encode the front buffer (each buffer is two scanlines) +#if !PICO_RP2040 + // On 
RP2040 we could never reach this point early, because of the slow encode! + dma_channel_wait_for_finish_blocking(img_dma_chan); +#endif flash_bulk_dma_start( (uint32_t*)img_buf[img_buf_back], current_image_base + ((y + 2) % (2 * FRAME_HEIGHT)) * IMAGE_SCANLINE_SIZE, IMAGE_SCANLINE_SIZE * 2 / sizeof(uint32_t), img_dma_chan ); - const uint16_t *img = (const uint16_t*)img_buf[img_buf_front]; + const uint16_t *img = (const uint16_t*)img_buf[img_buf_front]; uint32_t *our_tmds_buf, *their_tmds_buf; queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf); multicore_fifo_push_blocking((uint32_t)(img)); multicore_fifo_push_blocking((uint32_t)their_tmds_buf); - + queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf); prepare_scanline((const uint32_t*)(img + FRAME_WIDTH * 2), our_tmds_buf); - + multicore_fifo_pop_blocking(); queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf); queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf); @@ -156,4 +188,3 @@ int __not_in_flash("main") main() { } __builtin_unreachable(); } - diff --git a/software/assets/vista_data.uf2 b/software/assets/vista_data_rp2040.uf2 similarity index 100% rename from software/assets/vista_data.uf2 rename to software/assets/vista_data_rp2040.uf2 diff --git a/software/assets/vista_data_rp2350.uf2 b/software/assets/vista_data_rp2350.uf2 new file mode 100644 index 0000000..28997eb Binary files /dev/null and b/software/assets/vista_data_rp2350.uf2 differ diff --git a/software/include/common_dvi_pin_configs.h b/software/include/common_dvi_pin_configs.h index 12e4240..01e8a12 100644 --- a/software/include/common_dvi_pin_configs.h +++ b/software/include/common_dvi_pin_configs.h @@ -28,6 +28,17 @@ static const struct dvi_serialiser_cfg picodvi_reva_dvi_cfg = { .invert_diffpairs = true }; +// AMY-DVI board, for getting HDMI from the RP2350 FPGA development platform, +// again a cursed board that only a couple of people in the world possess: +static const struct dvi_serialiser_cfg amy_dvi_cfg = { + 
.pio = DVI_DEFAULT_PIO_INST, + .sm_tmds = {0, 1, 2}, + .pins_tmds = {14, 16, 18}, + .pins_clk = 12, + .invert_diffpairs = true +}; + + // The not-HDMI socket on Rev C PicoDVI boards // (we don't talk about Rev B) static const struct dvi_serialiser_cfg picodvi_dvi_cfg = { diff --git a/software/libdvi/dvi.c b/software/libdvi/dvi.c index f66377f..a00bb93 100644 --- a/software/libdvi/dvi.c +++ b/software/libdvi/dvi.c @@ -184,7 +184,7 @@ static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) { // Make sure all three channels have definitely loaded their last block // (should be within a few cycles of one another) for (int i = 0; i < N_TMDS_LANES; ++i) { - while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD) + while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].dbg_tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD) tight_loop_contents(); } diff --git a/software/libdvi/dvi_config_defs.h b/software/libdvi/dvi_config_defs.h index 448c515..c7b8d27 100644 --- a/software/libdvi/dvi_config_defs.h +++ b/software/libdvi/dvi_config_defs.h @@ -51,8 +51,16 @@ #define DVI_SYMBOLS_PER_WORD 2 #endif -#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2 -#error "Unsupported value for DVI_SYMBOLS_PER_WORD" +// Implement TMDS encode with hardware encoders in SIO, instead of +// interpolators + LUTs. The processor still has to crank the encoder, but +// it's much faster. This still works with PIO serialisers, which can appear +// on any GPIO, unlike the HSTX which is limited to specific GPIOs. 
+#ifndef DVI_USE_SIO_TMDS_ENCODER +#if PICO_RP2040 +#define DVI_USE_SIO_TMDS_ENCODER 0 +#else +#define DVI_USE_SIO_TMDS_ENCODER 1 +#endif #endif // ---------------------------------------------------------------------------- diff --git a/software/libdvi/tmds_encode.S b/software/libdvi/tmds_encode.S index 065061d..b095e75 100644 --- a/software/libdvi/tmds_encode.S +++ b/software/libdvi/tmds_encode.S @@ -2,6 +2,10 @@ #include "hardware/regs/sio.h" #include "dvi_config_defs.h" +// This file contains both Arm and RISC-V source, with the correct version +// selected via the __arm__ and __riscv predefined macros. The targeted Arm +// dialect is Armv6-M, and the targeted RISC-V dialect is RV32IZba + // Offsets suitable for ldr/str (must be <= 0x7c): #define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) #define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) @@ -13,23 +17,33 @@ // Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit // word-addressed space... almost as though it were intentional! 
:) +#if defined(__arm__) && defined(__riscv) +#error "wat" +#endif + +#ifdef __arm__ .syntax unified .cpu cortex-m0plus .thumb +#endif .macro decl_func_x name .section .scratch_x.\name, "ax" .global \name +#ifdef __arm__ .type \name,%function .thumb_func +#endif \name: .endm .macro decl_func_y name .section .scratch_y.\name, "ax" .global \name +#ifdef __arm__ .type \name,%function .thumb_func +#endif \name: .endm @@ -41,7 +55,10 @@ // r0: Input buffer (word-aligned) // r1: Output buffer (word-aligned) // r2: Input size (pixels) +// r3: Left shift (for the *_leftshift variant only -- costs 1 cycle per 2 pixels) +#if defined(__arm__) +// Armv6-M: .macro do_channel_16bpp r_ibase r_inout0 r_out1 str \r_inout0, [\r_ibase, #ACCUM0_OFFS] ldr \r_inout0, [\r_ibase, #PEEK0_OFFS] @@ -50,8 +67,11 @@ ldr \r_out1, [\r_out1] .endm -decl_func tmds_encode_loop_16bpp +.macro tmds_encode_loop_16bpp_impl leftshift push {r4, r5, r6, r7, lr} + // Bounds calculation: each input pixel results in two output pixels, + // whose two TMDS symbols are packed in a single 32-bit word. So, 4 bytes + // out per one pixel in. 
lsls r2, #2 add r2, r1 mov ip, r2 @@ -61,7 +81,13 @@ decl_func tmds_encode_loop_16bpp 1: .rept TMDS_ENCODE_UNROLL ldmia r0!, {r4, r6} +.if \leftshift + lsls r4, r3 +.endif do_channel_16bpp r2, r4, r5 +.if \leftshift + lsls r6, r3 +.endif do_channel_16bpp r2, r6, r7 stmia r1!, {r4, r5, r6, r7} .endr @@ -69,82 +95,72 @@ decl_func tmds_encode_loop_16bpp cmp r1, ip bne 1b pop {r4, r5, r6, r7, pc} +.endm -// Same as above, but scale data to make up for lack of left shift -// in interpolator (costs 1 cycle per 2 pixels) -// -// r0: Input buffer (word-aligned) -// r1: Output buffer (word-aligned) -// r2: Input size (pixels) -// r3: Left shift amount +#elif defined(__riscv) +.macro do_channel_16bpp r_ibase r_inout0 r_out1 + sw \r_inout0, ACCUM0_OFFS(\r_ibase) + // Note two halves are interleaved to avoid load->addr dependency + lw \r_inout0, PEEK0_OFFS(\r_ibase) + lw \r_out1, PEEK1_OFFS(\r_ibase) + lw \r_inout0, (\r_inout0) + lw \r_out1, (\r_out1) +.endm + +.macro tmds_encode_loop_16bpp_impl leftshift + slli a2, a2, 2 + add t0, a2, a1 + bgeu a1, t0, 2f + li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET +.align 2 +1: +.set i, 0 +.rept TMDS_ENCODE_UNROLL + lw a4, 8 * i + 0(a0) + lw a6, 8 * i + 4(a0) +.if \leftshift + sll a4, a4, a3 + sll a6, a6, a3 +.endif + do_channel_16bpp a2, a4, a5 + do_channel_16bpp a2, a6, a7 + sw a4, 16 * i + 0(a1) + sw a5, 16 * i + 4(a1) + sw a6, 16 * i + 8(a1) + sw a7, 16 * i + 12(a1) +.set i, i + 1 +.endr + addi a0, a0, 8 * TMDS_ENCODE_UNROLL + addi a1, a1, 16 * TMDS_ENCODE_UNROLL + bltu a1, t0, 1b +2: + ret +.endm + +#else +#error "Unknown architecture" +#endif + +decl_func tmds_encode_loop_16bpp +tmds_encode_loop_16bpp_impl 0 decl_func tmds_encode_loop_16bpp_leftshift - push {r4, r5, r6, r7, lr} - lsls r2, #2 - add r2, r1 - mov ip, r2 - ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) - b 2f -.align 2 -1: -.rept TMDS_ENCODE_UNROLL - ldmia r0!, {r4, r6} - lsls r4, r3 - do_channel_16bpp r2, r4, r5 - lsls r6, r3 - do_channel_16bpp r2, r6, r7 - stmia r1!, 
{r4, r5, r6, r7} -.endr -2: - cmp r1, ip - bne 1b - pop {r4, r5, r6, r7, pc} +tmds_encode_loop_16bpp_impl 1 // r0: Input buffer (word-aligned) // r1: Output buffer (word-aligned) // r2: Input size (pixels) - -decl_func tmds_encode_loop_8bpp - push {r4, r5, r6, r7, lr} - lsls r2, #2 - add r2, r1 - mov ip, r2 - ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) - b 2f -.align 2 -1: -.rept TMDS_ENCODE_UNROLL - ldmia r0!, {r4} - str r4, [r2, #ACCUM0_OFFS + INTERP1] - str r4, [r2, #ACCUM0_OFFS] - ldr r4, [r2, #PEEK0_OFFS] - ldr r4, [r4] - ldr r5, [r2, #PEEK1_OFFS] - ldr r5, [r5] - ldr r6, [r2, #PEEK0_OFFS + INTERP1] - ldr r6, [r6] - ldr r7, [r2, #PEEK1_OFFS + INTERP1] - ldr r7, [r7] - stmia r1!, {r4, r5, r6, r7} -.endr -2: - cmp r1, ip - bne 1b - pop {r4, r5, r6, r7, pc} - -// r0: Input buffer (word-aligned) -// r1: Output buffer (word-aligned) -// r2: Input size (pixels) -// r3: Left shift amount +// r3: Left shift amount (for the *_leftshift variant of the function) // // Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not // the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as // the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift, // since its channel MSBs are no greater than 7. 
-decl_func tmds_encode_loop_8bpp_leftshift +#if defined(__arm__) +.macro tmds_encode_loop_8bpp_impl leftshift push {r4, r5, r6, r7, lr} - lsls r2, #3 + lsls r2, #2 add r2, r1 mov ip, r2 ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) @@ -154,7 +170,9 @@ decl_func tmds_encode_loop_8bpp_leftshift .rept TMDS_ENCODE_UNROLL ldmia r0!, {r4} str r4, [r2, #ACCUM0_OFFS + INTERP1] +.if \leftshift lsls r4, r3 +.endif str r4, [r2, #ACCUM0_OFFS] ldr r4, [r2, #PEEK0_OFFS] ldr r4, [r4] @@ -170,6 +188,54 @@ decl_func tmds_encode_loop_8bpp_leftshift cmp r1, ip bne 1b pop {r4, r5, r6, r7, pc} +.endm + +#elif defined(__riscv) +.macro tmds_encode_loop_8bpp_impl leftshift + slli a2, a2, 2 + add a2, a2, a1 + bgeu a1, a2, 2f + mv t0, a2 + li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET +.align 2 +1: +.set i, 0 +.rept TMDS_ENCODE_UNROLL + lw a4, 4 * i(a0) + sw a4, ACCUM0_OFFS + INTERP1(a2) +.if \leftshift + sll a4, a4, a3 +.endif + sw a4, ACCUM0_OFFS(a2) + lw a4, PEEK0_OFFS(a2) + lw a5, PEEK1_OFFS(a2) + lw a4, (a4) + lw a5, (a5) + lw a6, PEEK0_OFFS + INTERP1(a2) + lw a7, PEEK1_OFFS + INTERP1(a2) + lw a6, (a6) + lw a7, (a7) + sw a4, 16 * i + 0(a1) + sw a5, 16 * i + 4(a1) + sw a6, 16 * i + 8(a1) + sw a7, 16 * i + 12(a1) +.set i, i + 1 +.endr + addi a0, a0, TMDS_ENCODE_UNROLL * 4 + addi a1, a1, TMDS_ENCODE_UNROLL * 16 + bltu a1, t0, 1b +2: + ret +.endm + +#else +#error "Unknown architecture" +#endif + +decl_func tmds_encode_loop_8bpp +tmds_encode_loop_8bpp_impl 0 +decl_func tmds_encode_loop_8bpp_leftshift +tmds_encode_loop_8bpp_impl 1 // ---------------------------------------------------------------------------- // Fast 1bpp black/white encoder (full res) @@ -190,6 +256,8 @@ decl_func tmds_encode_loop_8bpp_leftshift // r3 contains lookup mask (preshifted) // r8 contains pointer to encode table // 2.125 cyc/pix + +#if defined(__arm__) .macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1 \shift_instr0 r4, r2, #\shamt0 ands r4, r3 @@ -238,6 +306,58 @@ decl_func tmds_encode_1bpp mov 
r8, r7 pop {r4-r7, pc} +#elif defined(__riscv) +// TODO the register allocation is not optimal here for code size +.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1 + \shift_instr0 a4, a2, \shamt0 + and a4, a4, a3 + add a4, a4, t1 + lw a5, 4(a4) + lw a4, 0(a4) + \shift_instr1 a6, a2, \shamt1 + and a6, a6, a3 + add a6, a6, t1 + lw a7, 4(a6) + lw a6, 0(a6) + sw a4, 0(a1) + sw a5, 4(a1) + sw a6, 8(a1) + sw a7, 12(a1) + addi a1, a1, 16 +.endm + +// a0: input buffer (word-aligned) +// a1: output buffer (word-aligned) +// a2: output pixel count +decl_func tmds_encode_1bpp + slli a2, a2, 1 + add t0, a2, a1 + la t1, tmds_1bpp_table + // Mask: 4 bit index, 8 bytes per entry + li a3, 0x78 + bgeu a1, t0, 2f +1: + lw a2, (a0) + addi a0, a0, 4 +#if !DVI_1BPP_BIT_REVERSE + tmds_encode_1bpp_body slli 3 srli 1 + tmds_encode_1bpp_body srli 5 srli 9 + tmds_encode_1bpp_body srli 13 srli 17 + tmds_encode_1bpp_body srli 21 srli 25 +#else + tmds_encode_1bpp_body srli 1 slli 3 + tmds_encode_1bpp_body srli 9 srli 5 + tmds_encode_1bpp_body srli 17 srli 13 + tmds_encode_1bpp_body srli 25 srli 21 +#endif + bltu a1, t0, 1b +2: + ret + +#else +#error "Unknown architecture" +#endif + .align 2 tmds_1bpp_table: #if !DVI_1BPP_BIT_REVERSE @@ -299,6 +419,7 @@ tmds_1bpp_table: // level 2: (a5 -> 163) always // level 3: (ef -> 2f0) always +#if defined(__arm__) // Table base pointer in r0. Input pixels in r2. .macro encode_2bpp_body shift_instr shamt rd \shift_instr \rd, r2, #\shamt @@ -343,6 +464,55 @@ decl_func tmds_encode_2bpp mov r8, r7 pop {r4-r7, pc} +#elif defined(__riscv) +// Table base pointer in a0. Input pixels in a2. +.macro encode_2bpp_body shift_instr shamt rd + \shift_instr \rd, a2, \shamt + and \rd, \rd, a3 + add \rd, \rd, a0 + lw \rd, (\rd) +.endm + +// a0: input buffer (word-aligned) +// a1: output buffer (word-aligned) +// a2: output pixel count +decl_func tmds_encode_2bpp + mv t1, a0 + la a0, tmds_2bpp_table + // Mask: 4-bit index into 4-byte entries. 
+ li a3, 0x3c + // Limit pointer: 1 word per 2 pixels + slli a2, a2, 1 + add t0, a2, a1 + bgeu a1, t0, 2f // Nothing to encode: skip the loop +1: + lw a2, (t1) + addi t1, t1, 4 + encode_2bpp_body slli 2 a4 + encode_2bpp_body srli 2 a5 + encode_2bpp_body srli 6 a6 + encode_2bpp_body srli 10 a7 + sw a4, 0(a1) + sw a5, 4(a1) + sw a6, 8(a1) + sw a7, 12(a1) + encode_2bpp_body srli 14 a4 + encode_2bpp_body srli 18 a5 + encode_2bpp_body srli 22 a6 + encode_2bpp_body srli 26 a7 + sw a4, 16(a1) + sw a5, 20(a1) + sw a6, 24(a1) + sw a7, 28(a1) + addi a1, a1, 32 + bltu a1, t0, 1b +2: + ret + +#else +#error "Unknown architecture" +#endif + .align 2 tmds_2bpp_table: .word 0x7f103 // 00, 00 @@ -404,17 +574,20 @@ tmds_2bpp_table: // much better, and many monitors will still accept the signals as long as you // DC couple your DVI signals. -.macro tmds_fullres_encode_loop_body ra rb +#if defined(__arm__) +.macro tmds_fullres_encode_loop_body leftshift ra rb str \ra, [r2, #ACCUM0_OFFS + INTERP1] +.if \leftshift + lsls \ra, r3 +.endif str \ra, [r2, #ACCUM0_OFFS] + // Loads interleaved to avoid rdata->addr stall on M33 ldr \ra, [r2, #PEEK2_OFFS] - ldr \ra, [\ra] -#if !TMDS_FULLRES_NO_DC_BALANCE - str \ra, [r2, #ACCUM1_ADD_OFFS] -#endif ldr \rb, [r2, #PEEK2_OFFS + INTERP1] + ldr \ra, [\ra] ldr \rb, [\rb] #if !TMDS_FULLRES_NO_DC_BALANCE + str \ra, [r2, #ACCUM1_ADD_OFFS] str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1] #endif .endm @@ -422,8 +595,9 @@ tmds_2bpp_table: // r0: Input buffer (word-aligned) // r1: Output buffer (word-aligned) // r2: Pixel count +// r3: Left shift amount -.macro tmds_fullres_encode_loop_16bpp +.macro tmds_fullres_encode_loop_16bpp leftshift push {r4-r7, lr} mov r4, r8 push {r4} @@ -451,8 +625,8 @@ tmds_2bpp_table: 1: .rept 16 ldmia r0!, {r4, r6} - tmds_fullres_encode_loop_body r4 r5 - tmds_fullres_encode_loop_body r6 r7 + tmds_fullres_encode_loop_body \leftshift r4 r5 + tmds_fullres_encode_loop_body \leftshift r6 r7 stmia r1!, {r4, r5, r6, r7} .endr 2: @@ -465,82 +639,77 @@ tmds_2bpp_table: pop {r4-r7, 
pc} .endm -// One copy each in X and Y, so the two cores don't step on each other -decl_func_x tmds_fullres_encode_loop_16bpp_x - tmds_fullres_encode_loop_16bpp -decl_func_y tmds_fullres_encode_loop_16bpp_y - tmds_fullres_encode_loop_16bpp +#elif defined(__riscv) - -.macro tmds_fullres_encode_loop_body_leftshift ra rb - // Note we apply the leftshift for INTERP0 only - str \ra, [r2, #ACCUM0_OFFS + INTERP1] - lsls \ra, r3 - str \ra, [r2, #ACCUM0_OFFS] - ldr \ra, [r2, #PEEK2_OFFS] - ldr \ra, [\ra] +.macro tmds_fullres_encode_loop_body leftshift ra rb + sw \ra, ACCUM0_OFFS + INTERP1(a2) +.if \leftshift + sll \ra, \ra, a3 +.endif + sw \ra, ACCUM0_OFFS(a2) + lw \ra, PEEK2_OFFS(a2) + lw \rb, PEEK2_OFFS + INTERP1(a2) + lw \ra, (\ra) + lw \rb, (\rb) #if !TMDS_FULLRES_NO_DC_BALANCE - str \ra, [r2, #ACCUM1_ADD_OFFS] -#endif - ldr \rb, [r2, #PEEK2_OFFS + INTERP1] - ldr \rb, [\rb] -#if !TMDS_FULLRES_NO_DC_BALANCE - str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1] + sw \ra, ACCUM1_ADD_OFFS(a2) + sw \rb, ACCUM1_ADD_OFFS + INTERP1(a2) #endif .endm -// r0: Input buffer (word-aligned) -// r1: Output buffer (word-aligned) -// r2: Pixel count -// r3: Left shift amount +// a0: Input buffer (word-aligned) +// a1: Output buffer (word-aligned) +// a2: Pixel count +// a3: Left shift amount -.macro tmds_fullres_encode_loop_16bpp_leftshift - push {r4-r7, lr} - mov r4, r8 - mov r5, r9 - push {r4-r5} - - lsls r2, #2 - add r2, r1 - mov ip, r2 - ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) +.macro tmds_fullres_encode_loop_16bpp leftshift + sh2add t0, a2, a1 + li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET // DC balance defined to be 0 at start of scanline: - movs r4, #0 - str r4, [r2, #ACCUM1_OFFS] + li a4, 0 + sw a4, ACCUM1_OFFS(a2) #if TMDS_FULLRES_NO_DC_BALANCE - // Alternate parity between odd/even symbols if there's no balance feedback - mvns r4, r4 + // Alternate parity between odd/even symbols if no feedback + li a4, -1 #endif - str r4, [r2, #ACCUM1_OFFS + INTERP1] + sw a4, ACCUM1_OFFS + 
INTERP1(a2) - adr r4, 1f - adds r4, #1 - mov r8, r4 - b 2f + bgeu a1, t0, 2f .align 2 1: -.rept 16 // 64 pixels per iteration - ldmia r0!, {r4, r6} - tmds_fullres_encode_loop_body_leftshift r4 r5 - tmds_fullres_encode_loop_body_leftshift r6 r7 - stmia r1!, {r4, r5, r6, r7} +.set i, 0 +.rept 16 + lw a4, 8 * i + 0(a0) + lw a6, 8 * i + 4(a0) + tmds_fullres_encode_loop_body \leftshift a4 a5 + tmds_fullres_encode_loop_body \leftshift a6 a7 + sw a4, 16 * i + 0(a1) + sw a5, 16 * i + 4(a1) + sw a6, 16 * i + 8(a1) + sw a7, 16 * i + 12(a1) +.set i, i + 1 .endr + addi a0, a0, 8 * i + addi a1, a1, 16 * i + bltu a1, t0, 1b 2: - cmp r1, ip - beq 1f - bx r8 -1: - pop {r4-r5} - mov r8, r4 - mov r9, r5 - pop {r4-r7, pc} + ret .endm -decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x - tmds_fullres_encode_loop_16bpp_leftshift -decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y - tmds_fullres_encode_loop_16bpp_leftshift +#else +#error "Unknown architecture" +#endif +// One copy each in X and Y, so the two cores don't step on each other +decl_func_x tmds_fullres_encode_loop_16bpp_x + tmds_fullres_encode_loop_16bpp 0 +decl_func_y tmds_fullres_encode_loop_16bpp_y + tmds_fullres_encode_loop_16bpp 0 + +decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x + tmds_fullres_encode_loop_16bpp 1 +decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y + tmds_fullres_encode_loop_16bpp 1 // ---------------------------------------------------------------------------- // Full-resolution 8bpp paletted encode @@ -550,19 +719,19 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y // base is set to a reordered list of TMDS symbols based // on a user colour palette. +#ifdef __arm__ // Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains // interp base pointer. r7 used as temporary. 
.macro tmds_palette_encode_loop_body rd str \rd, [r2, #ACCUM0_OFFS] str \rd, [r2, #ACCUM0_OFFS + INTERP1] + // Loads interleaved to avoid rdata->addr stall on M33 ldr \rd, [r2, #PEEK2_OFFS] - ldr \rd, [\rd] -#if !TMDS_FULLRES_NO_DC_BALANCE - str \rd, [r2, #ACCUM1_ADD_OFFS] -#endif ldr r7, [r2, #PEEK2_OFFS + INTERP1] + ldr \rd, [\rd] ldr r7, [r7] #if !TMDS_FULLRES_NO_DC_BALANCE + str \rd, [r2, #ACCUM1_ADD_OFFS] str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1] #endif lsls r7, #10 @@ -617,7 +786,241 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y pop {r4-r7, pc} .endm +#elif defined(__riscv) + +// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. a2 contains +// interp base pointer. a5 used as temporary. +.macro tmds_palette_encode_loop_body rd + sw \rd, ACCUM0_OFFS(a2) + sw \rd, ACCUM0_OFFS + INTERP1(a2) + lw \rd, PEEK2_OFFS(a2) + lw a5, PEEK2_OFFS + INTERP1(a2) + lw \rd, (\rd) + lw a5, (a5) +#if !TMDS_FULLRES_NO_DC_BALANCE + sw \rd, ACCUM1_ADD_OFFS(a2) + sw a5, ACCUM1_ADD_OFFS + INTERP1(a2) +#endif + slli a5, a5, 10 + or \rd, \rd, a5 +.endm + +.macro tmds_palette_encode_loop + mv t1, s0 + sh1add t0, a2, a1 + li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET + // DC balance defined to be 0 at start of scanline: + li a4, 0 + sw a4, ACCUM1_OFFS(a2) +#if TMDS_FULLRES_NO_DC_BALANCE + // Alternate parity between odd/even symbols if there's no balance feedback + li a4, -1 +#endif + sw a4, ACCUM1_OFFS + INTERP1(a2) + + bgeu a1, t0, 2f + .align 2 +1: +.set i, 0 +.rept 10 + lw a3, 8 * i + 0(a0) + lw s0, 8 * i + 4(a0) + srli a4, a3, 14 + slli a3, a3, 2 + tmds_palette_encode_loop_body a3 + tmds_palette_encode_loop_body a4 + sw a3, 16 * i + 0(a1) + sw a4, 16 * i + 4(a1) + srli a4, s0, 14 + slli s0, s0, 2 + tmds_palette_encode_loop_body s0 + tmds_palette_encode_loop_body a4 + sw s0, 16 * i + 8(a1) + sw a4, 16 * i + 12(a1) +.set i, i + 1 +.endr + addi a0, a0, 8 * i + addi a1, a1, 16 * i + bltu a1, t0, 1b +2: + mv s0, t1 + ret +.endm + + +#endif + decl_func_x 
tmds_palette_encode_loop_x tmds_palette_encode_loop decl_func_y tmds_palette_encode_loop_y tmds_palette_encode_loop + +// ---------------------------------------------------------------------------- +// Hand-cranking loops for SIO TMDS encoders + +#if DVI_USE_SIO_TMDS_ENCODER + +#if defined(__arm__) + +// r0: input buffer (word-aligned) +// r1: output buffer (word-aligned) +// r2: pixel count + +.macro tmds_encode_sio_loop size_ratio peek + +// For larger load/store offsets at high ratios/unroll: +.cpu cortex-m33 + +.if \size_ratio > 4 * TMDS_ENCODE_UNROLL +.set unroll, 1 +.else +.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio +.endif + +.if \peek +.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET) +.else +.set even_offset_adj, 0 +.endif + + push {r4, lr} +#if DVI_SYMBOLS_PER_WORD == 1 + lsls r2, r2, #2 +#else + lsls r2, r2, #1 +#endif + adds r2, r1 + ldr r3, =SIO_BASE + SIO_TMDS_CTRL_OFFSET + b 2f +1: +.set i, 0 +.rept unroll + ldr r4, [r0, #i * 4] + str r4, [r3, #SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET] +.set j, 0 +.rept \size_ratio +.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1) +#if DVI_SYMBOLS_PER_WORD == 2 + ldr r4, [r3, #offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET] +#else + ldr r4, [r3, #offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET] +#endif + str r4, [r1, #4 * (j + i * \size_ratio)] +.set j, j + 1 +.endr +.set i, i + 1 +.endr + adds r0, 4 * unroll + adds r1, 4 * unroll * \size_ratio +2: + cmp r1, r2 + blo 1b + pop {r4, pc} + +.cpu cortex-m0plus +.endm + +#elif defined(__riscv) + +// a0: input buffer (word-aligned) +// a1: output buffer (word-aligned) +// a2: pixel count + +.macro tmds_encode_sio_loop size_ratio peek + +.if \size_ratio > 4 * TMDS_ENCODE_UNROLL +.set unroll, 1 +.else +.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio +.endif + +.if \peek +.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET) +.else +.set 
even_offset_adj, 0 +.endif + +#if DVI_SYMBOLS_PER_WORD == 1 + sh2add a2, a2, a1 +#else + sh1add a2, a2, a1 +#endif + li a3, SIO_BASE + SIO_TMDS_CTRL_OFFSET + bgeu a1, a2, 2f +1: +.set i, 0 +.rept unroll + lw a4, i * 4(a0) + sw a4, SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET(a3) +.set j, 0 +.rept \size_ratio +.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1) +#if DVI_SYMBOLS_PER_WORD == 2 + lw a4, offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET(a3) +#else + lw a4, offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET(a3) +#endif + sw a4, 4 * (j + i * \size_ratio)(a1) +.set j, j + 1 +.endr +.set i, i + 1 +.endr + addi a0, a0, 4 * unroll + addi a1, a1, 4 * unroll * \size_ratio + bltu a1, a2, 1b +2: + ret +.endm + +#else +#error "Unknown architecture" +#endif + +// For DVI_SYMBOLS_PER_WORD == 2, the ratio of output : input buffer size is: +// +// Bits/pixel | Ratio (with hdouble) | Ratio (no hdouble) +// -----------+----------------------+------------------- +// 1 | 32 | 16 +// 2 | 16 | 8 +// 4 | 8 | 4 +// 8 | 4 | 2 +// 16 | 2 | 1 +// +// For DVI_SYMBOLS_PER_WORD == 1, these ratios are doubled. 
+ +// poppop variants will read from a xxx_POP register for every output word +decl_func tmds_encode_sio_loop_poppop_ratio1 + tmds_encode_sio_loop 1, 0 +decl_func tmds_encode_sio_loop_poppop_ratio2 + tmds_encode_sio_loop 2, 0 +decl_func tmds_encode_sio_loop_poppop_ratio4 + tmds_encode_sio_loop 4, 0 +decl_func tmds_encode_sio_loop_poppop_ratio8 + tmds_encode_sio_loop 8, 0 +decl_func tmds_encode_sio_loop_poppop_ratio16 + tmds_encode_sio_loop 16, 0 +decl_func tmds_encode_sio_loop_poppop_ratio32 + tmds_encode_sio_loop 32, 0 +decl_func tmds_encode_sio_loop_poppop_ratio64 + tmds_encode_sio_loop 64, 0 + +// peekpop variants will read alternately from xxx_PEEK and xxx_POP: this is +// needed for pixel-doubled output when DVI_SYMBOLS_PER_WORD == 1 (note the +// POP value is different from the PEEK value, as it's the same data but with +// different running DC balance) +decl_func tmds_encode_sio_loop_peekpop_ratio1 + tmds_encode_sio_loop 1, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio2 + tmds_encode_sio_loop 2, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio4 + tmds_encode_sio_loop 4, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio8 + tmds_encode_sio_loop 8, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio16 + tmds_encode_sio_loop 16, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio32 + tmds_encode_sio_loop 32, 1 +decl_func tmds_encode_sio_loop_peekpop_ratio64 + tmds_encode_sio_loop 64, 1 + +#endif diff --git a/software/libdvi/tmds_encode.c b/software/libdvi/tmds_encode.c index 43a81c2..1ca5be4 100644 --- a/software/libdvi/tmds_encode.c +++ b/software/libdvi/tmds_encode.c @@ -3,7 +3,7 @@ #include "hardware/gpio.h" #include "hardware/sync.h" -static const uint32_t __scratch_x("tmds_table") tmds_table[] = { +static const __unused uint32_t __scratch_x("tmds_table") tmds_table[] = { #include "tmds_table.h" }; @@ -11,14 +11,15 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = { // memory. 
There is a third copy which can go in flash, because it's just used // to generate palette LUTs. The ones we don't use will get garbage collected // during linking. -const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = { +const __unused uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = { #include "tmds_table_fullres.h" }; -const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = { +const __unused uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = { #include "tmds_table_fullres.h" }; +#if !DVI_USE_SIO_TMDS_ENCODER // Configure an interpolator to extract a single colour channel from each of a pair // of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being // pixel_width wide. Produce a LUT address for the first pixel's colour data on @@ -35,11 +36,16 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp int shift_channel_to_index = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift; int oops = 0; +#if PICO_RP2040 if (shift_channel_to_index < 0) { // "It's ok we'll fix it in software" oops = -shift_channel_to_index; shift_channel_to_index = 0; } +#else + // Now a right-rotate, not a right-shift + shift_channel_to_index &= 0x1f; +#endif uint index_msb = index_shift + lut_index_width - 1; @@ -60,23 +66,60 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp return oops; } +#else +// Encoding a single channel at a time is not the most efficient way to use +// this hardware, because it means we read the colour buffer multiple times, +// but it fits better with how things are done in software on RP2040. 
+static void __not_in_flash_func(configure_sio_tmds_for_single_channel)(uint channel_msb, uint channel_lsb, uint pixel_width, bool hdouble) { + assert(channel_msb - channel_lsb <= 7); // 1 through 8 bits, inclusive + sio_hw->tmds_ctrl = + SIO_TMDS_CTRL_CLEAR_BALANCE_BITS | + ((channel_msb - channel_lsb) << SIO_TMDS_CTRL_L0_NBITS_LSB) | + (((channel_msb - 7u) & 0xfu) << SIO_TMDS_CTRL_L0_ROT_LSB) | + ((1 + __builtin_ctz(pixel_width)) << SIO_TMDS_CTRL_PIX_SHIFT_LSB) | + ((uint)hdouble << SIO_TMDS_CTRL_PIX2_NOSHIFT_LSB); +} +#endif + // Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer // of TMDS symbols from this colour channel. Number of pixels must be even, // pixel buffer must be word-aligned. void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { +#if DVI_USE_SIO_TMDS_ENCODER + configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, true); +#if DVI_SYMBOLS_PER_WORD == 1 + tmds_encode_sio_loop_peekpop_ratio4(pixbuf, symbuf, 2 * n_pix); +#else + tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, 2 * n_pix); +#endif +#else interp_hw_save_t interp0_save; interp_save(interp0_hw, &interp0_save); int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table); +#if PICO_RP2040 if (require_lshift) tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift); else tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix); +#else + assert(!require_lshift); (void)require_lshift; + tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix); +#endif interp_restore(interp0_hw, &interp0_save); +#endif } // As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned. 
void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { +#if DVI_USE_SIO_TMDS_ENCODER + configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 8, true); +#if DVI_SYMBOLS_PER_WORD == 1 + tmds_encode_sio_loop_peekpop_ratio8(pixbuf, symbuf, 2 * n_pix); +#else + tmds_encode_sio_loop_poppop_ratio4(pixbuf, symbuf, 2 * n_pix); +#endif +#else interp_hw_save_t interp0_save, interp1_save; interp_save(interp0_hw, &interp0_save); interp_save(interp1_hw, &interp1_save); @@ -86,12 +129,18 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table); int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table); assert(!lshift_upper); (void)lshift_upper; +#if PICO_RP2040 if (require_lshift) tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift); else tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix); +#else + assert(!require_lshift); (void)require_lshift; + tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix); +#endif interp_restore(interp0_hw, &interp0_save); interp_restore(interp1_hw, &interp1_save); +#endif } // ---------------------------------------------------------------------------- @@ -103,16 +152,22 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, // pixels, and INTERP1 for odd pixels. Note this means that even and odd // symbols have their DC balance handled separately, which is not to spec. 
+#if !DVI_USE_SIO_TMDS_ENCODER static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) { const uint index_shift = 2; // scaled lookup for 4-byte LUT entries int shift_channel_to_index = channel_msb - (lut_index_width - 1) - index_shift; int oops = 0; +#if PICO_RP2040 if (shift_channel_to_index < 0) { // "It's ok we'll fix it in software" oops = -shift_channel_to_index; shift_channel_to_index = 0; } +#else + // Now a right-rotate rather than right-shift + shift_channel_to_index &= 0x1f; +#endif uint index_msb = index_shift + lut_index_width - 1; @@ -133,8 +188,17 @@ static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t return oops; } +#endif void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { +#if DVI_USE_SIO_TMDS_ENCODER + configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, false); +#if DVI_SYMBOLS_PER_WORD == 1 + tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, n_pix); +#else + tmds_encode_sio_loop_poppop_ratio1(pixbuf, symbuf, n_pix); +#endif +#else uint core = get_core_num(); #if !TMDS_FULLRES_NO_INTERP_SAVE interp_hw_save_t interp0_save, interp1_save; @@ -165,17 +229,16 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t interp_restore(interp0_hw, &interp0_save); interp_restore(interp1_hw, &interp1_save); #endif +#endif } static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 }; -static inline int byte_imbalance(uint32_t x) -{ +static inline int byte_imbalance(uint32_t x) { return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF]; } -static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym) -{ +static void tmds_encode_symbols(uint8_t pixel, uint32_t* 
negative_balance_sym, uint32_t* positive_balance_sym) { int pixel_imbalance = byte_imbalance(pixel); uint32_t sym = pixel & 1; if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) { diff --git a/software/libdvi/tmds_encode.h b/software/libdvi/tmds_encode.h index ee8e244..6c903f9 100644 --- a/software/libdvi/tmds_encode.h +++ b/software/libdvi/tmds_encode.h @@ -34,4 +34,23 @@ void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +#if !PICO_RP2040 +// Crank the SIO TMDS encoder: +void tmds_encode_sio_loop_poppop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_poppop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); + +void tmds_encode_sio_loop_peekpop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_peekpop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_peekpop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_peekpop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_peekpop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_sio_loop_peekpop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void 
tmds_encode_sio_loop_peekpop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +#endif + #endif diff --git a/software/libsprite/CMakeLists.txt b/software/libsprite/CMakeLists.txt index bfc8c0e..22350a9 100644 --- a/software/libsprite/CMakeLists.txt +++ b/software/libsprite/CMakeLists.txt @@ -3,13 +3,24 @@ add_library(libsprite INTERFACE) target_sources(libsprite INTERFACE ${CMAKE_CURRENT_LIST_DIR}/affine_transform.h ${CMAKE_CURRENT_LIST_DIR}/sprite_asm_const.h - ${CMAKE_CURRENT_LIST_DIR}/sprite.S ${CMAKE_CURRENT_LIST_DIR}/sprite.c ${CMAKE_CURRENT_LIST_DIR}/sprite.h - ${CMAKE_CURRENT_LIST_DIR}/tile.S ${CMAKE_CURRENT_LIST_DIR}/tile.c ${CMAKE_CURRENT_LIST_DIR}/tile.h ) +if (PICO_RISCV) + target_sources(libsprite INTERFACE + ${CMAKE_CURRENT_LIST_DIR}/sprite_riscv.S + ${CMAKE_CURRENT_LIST_DIR}/tile_riscv.S + ) +else () + target_sources(libsprite INTERFACE + ${CMAKE_CURRENT_LIST_DIR}/sprite_armv6m.S + ${CMAKE_CURRENT_LIST_DIR}/tile_armv6m.S + ) +endif() + + target_include_directories(libsprite INTERFACE ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(libsprite INTERFACE pico_base_headers hardware_interp) diff --git a/software/libsprite/affine_transform.h b/software/libsprite/affine_transform.h index 188d282..3c4b597 100644 --- a/software/libsprite/affine_transform.h +++ b/software/libsprite/affine_transform.h @@ -4,7 +4,7 @@ // Stolen from RISCBoy #include -#include "pico/platform.h" +#include "pico.h" // Store unpacked affine transforms as signed 16.16 fixed point in the following order: // a00, a01, b0, a10, a11, b1 diff --git a/software/libsprite/sprite.c b/software/libsprite/sprite.c index cb92d52..44f6a80 100644 --- a/software/libsprite/sprite.c +++ b/software/libsprite/sprite.c @@ -1,7 +1,7 @@ #include "sprite.h" #include "affine_transform.h" -#include "pico/platform.h" // for __not_in_flash +#include "pico.h" // for __not_in_flash #include "hardware/interp.h" // Note some of the sprite routines are quite large (unrolled), so trying to diff --git 
a/software/libsprite/sprite.S b/software/libsprite/sprite_armv6m.S similarity index 100% rename from software/libsprite/sprite.S rename to software/libsprite/sprite_armv6m.S diff --git a/software/libsprite/sprite_asm_const.h b/software/libsprite/sprite_asm_const.h index 636f4d6..1704a6a 100644 --- a/software/libsprite/sprite_asm_const.h +++ b/software/libsprite/sprite_asm_const.h @@ -5,8 +5,11 @@ .macro decl_func name .section .time_critical.\name, "ax" .global \name +.p2align 2 +#ifndef __riscv .type \name,%function .thumb_func +#endif \name: .endm @@ -16,11 +19,40 @@ // same way as non-alpha pixels when encoding (and the co-opted channel LSB // always ends up being set on alpha pixels, which is pretty harmless) +// Also note this is expressed as a right-shift into the carry flag (on Arm), +// so this is equal to the bit index of the alpha bit plus 1. On RISC-V it's +// idiomatic to shift up to the sign bit instead, so a left shift of 32 - x +// should be used instead of a right shift of x. + #define ALPHA_SHIFT_16BPP 6 // Assume RAGB2132 (so alpha is bit 5) #define ALPHA_SHIFT_8BPP 6 +#ifdef __riscv +// Macros for forcing individual instructions to be 32 bits, to maintain +// branch target alignment without adding NOPs +.macro norvc_1a instr, arg0 +.option push +.option norvc +\instr \arg0 +.option pop +.endm + +.macro norvc_2a instr, arg0, arg1 +.option push +.option norvc +\instr \arg0, \arg1 +.option pop +.endm + +.macro norvc_3a instr, arg0, arg1, arg2 +.option push +.option norvc +\instr \arg0, \arg1, \arg2 +.option pop +.endm +#endif #endif diff --git a/software/libsprite/sprite_riscv.S b/software/libsprite/sprite_riscv.S new file mode 100644 index 0000000..0919785 --- /dev/null +++ b/software/libsprite/sprite_riscv.S @@ -0,0 +1,657 @@ +// Functions for doing simple 2D graphics operations on a RGB scanline buffer. 
+ +#include "hardware/regs/addressmap.h" +#include "hardware/regs/sio.h" + +#include "sprite_asm_const.h" + +#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) + +#if defined(__riscv_c) || defined(__riscv_zca) +#define RISCV_HAVE_COMPRESSED_ISA 1 +#endif + +// ---------------------------------------------------------------------------- +// Colour fill + +// a0: dst +// a1: value +// a2: count + +decl_func sprite_fill8 + // Slide for short fills (li forced to 32 bits, as in sprite_fill16, so the slide starts exactly 12 bytes after the auipc with no .align padding) + norvc_2a li a3, 18 + bltu a3, a2, 2f +#ifndef RISCV_HAVE_COMPRESSED_ISA +#error "This address computation is wrong for non-RVC:" +#endif + auipc a3, 0 // 32-bit instruction after address of auipc + slli a2, a2, 2 // 16-bit instruction after address of auipc + sub a3, a3, a2 // 16-bit instruction after address of auipc + jr a3, 18 * 4 + 12 // 32-bit instruction after address of auipc +.align 2 + // With Zcb this is a mix of 16-bit and 32-bit instructions due to the + // limited immediate size. Force 32-bit so we can do a computed branch. 
+.option push +.option norvc + sb a1, 17(a0) + sb a1, 16(a0) + sb a1, 15(a0) + sb a1, 14(a0) + sb a1, 13(a0) + sb a1, 12(a0) + sb a1, 11(a0) + sb a1, 10(a0) + sb a1, 9(a0) + sb a1, 8(a0) + sb a1, 7(a0) + sb a1, 6(a0) + sb a1, 5(a0) + sb a1, 4(a0) + sb a1, 3(a0) + sb a1, 2(a0) + sb a1, 1(a0) + sb a1, 0(a0) +.option pop + ret +2: + // Duplicate byte x4 + packh a1, a1, a1 + pack a1, a1, a1 + // Get a0 word-aligned (each fix-up store is skipped when that address bit is already clear): + andi a3, a0, 0x1 + beqz a3, 1f + sb a1, (a0) + addi a0, a0, 1 + addi a2, a2, -1 +1: + andi a3, a0, 0x2 + beqz a3, 1f + sh a1, (a0) + addi a0, a0, 2 + addi a2, a2, -2 +1: + // Set up for main loop. Limit pointer at end - (loop body size) + add a2, a2, a0 + addi a2, a2, -16 + + // Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide +1: + sw a1, 0(a0) + sw a1, 4(a0) + sw a1, 8(a0) + sw a1, 12(a0) + addi a0, a0, 16 + bgeu a2, a0, 1b + + // Main loop done, now tidy up the odds and ends. Note bits 3:0 of the + // pointer difference are not affected by us subtracting 16 earlier. 
+ sub a2, a2, a0 + // No more than 15 bytes remaining -- first test bit 3 by shifting it to sign bit + slli a2, a2, 28 + bgez a2, 1f + sw a1, 0(a0) + sw a1, 4(a0) + addi a0, a0, 8 +1: + slli a2, a2, 1 + bgez a2, 1f + sw a1, (a0) + addi a0, a0, 4 +1: + slli a2, a2, 1 + bgez a2, 1f + sh a1, (a0) + addi a0, a0, 2 +1: + slli a2, a2, 1 + bgez a2, 1f + sb a1, (a0) +1: + ret + +.p2align 2 +decl_func sprite_fill16 + // Slide for short fills + norvc_2a li a3, 16 + bltu a3, a2, 2f +#ifndef RISCV_HAVE_COMPRESSED_ISA +#error "This address computation is wrong for non-RVC:" +#endif + auipc a3, 0 // 32-bit instruction after address of auipc + slli a2, a2, 2 // 16-bit instruction after address of auipc + sub a3, a3, a2 // 16-bit instruction after address of auipc + jr a3, 16 * 4 + 12 // 32-bit instruction after address of auipc +.option push +.option norvc + sh a1, 30(a0) + sh a1, 28(a0) + sh a1, 26(a0) + sh a1, 24(a0) + sh a1, 22(a0) + sh a1, 20(a0) + sh a1, 18(a0) + sh a1, 16(a0) + sh a1, 14(a0) + sh a1, 12(a0) + sh a1, 10(a0) + sh a1, 8(a0) + sh a1, 6(a0) + sh a1, 4(a0) + sh a1, 2(a0) + sh a1, 0(a0) +.option pop + ret +2: + // Get word-aligned before main fill loop (test bit 1 of the dst pointer, not of the pixel count) + andi a3, a0, 0x2 + beqz a3, 1f + sh a1, (a0) + addi a0, a0, 2 + addi a2, a2, -1 +1: + // Set limit pointer at end - (loop body size) + slli a2, a2, 1 + add a2, a2, a0 + addi a2, a2, -32 + pack a1, a1, a1 + // We can fall through because cases < 1 loop are handled by slide +1: + sw a1, 0(a0) + sw a1, 4(a0) + sw a1, 8(a0) + sw a1, 12(a0) + sw a1, 16(a0) + sw a1, 20(a0) + sw a1, 24(a0) + sw a1, 28(a0) + addi a0, a0, 32 + bgeu a2, a0, 1b + + // Most of the work done, we have a few more to tidy up -- note bits 4:1 + // of the pointer difference are not affected by earlier subtraction of 32 + sub a2, a2, a0 + + // Bit 4 becomes sign bit + slli a2, a2, 27 + bgez a2, 1f + sw a1, 0(a0) + sw a1, 4(a0) + sw a1, 8(a0) + sw a1, 12(a0) + addi a0, a0, 16 +1: + slli a2, a2, 1 + bgez a2, 1f + sw a1, 0(a0) + sw a1, 4(a0) + addi a0, 
a0, 8 +1: + slli a2, a2, 1 + bgez a2, 1f + sw a1, 0(a0) + addi a0, a0, 4 +1: + slli a2, a2, 1 + bgez a2, 1f + sh a1, 0(a0) +1: + ret + + +// ---------------------------------------------------------------------------- +// Non-AT sprite + + +// TODO 8-bit version not yet ported to RISC-V +#if 0 +// Unrolled loop body with an initial computed branch. + +// a0: dst +// a1: src +// a2: pixel count +decl_func sprite_blit8 + mov ip, a0 + lsrs a3, a2, #3 + lsls a3, #3 + eors a2, a3 // a2 = pixels % 8, a3 = pixels - pixels % 8 + + add a0, a3 + add a1, a3 + + adr a3, 2f + lsls a2, #2 + subs a3, a2 + adds a3, #1 // thumb bit >:( + bx a3 + +.align 2 +1: + subs a0, #8 + subs a1, #8 + ldrb a3, [a1, #7] + strb a3, [a0, #7] + ldrb a3, [a1, #6] + strb a3, [a0, #6] + ldrb a3, [a1, #5] + strb a3, [a0, #5] + ldrb a3, [a1, #4] + strb a3, [a0, #4] + ldrb a3, [a1, #3] + strb a3, [a0, #3] + ldrb a3, [a1, #2] + strb a3, [a0, #2] + ldrb a3, [a1, #1] + strb a3, [a0, #1] + ldrb a3, [a1, #0] + strb a3, [a0, #0] +2: + cmp a0, ip + bhi 1b + bx lr + +.macro sprite_blit8_alpha_body n + ldrb a3, [a1, #\n] + lsrs a2, a3, #ALPHA_SHIFT_8BPP + bcc 2f + strb a3, [a0, #\n] +2: +.endm + +// a0: dst +// a1: src +// a2: pixel count +decl_func sprite_blit8_alpha + mov ip, a0 + lsrs a3, a2, #3 + lsls a3, #3 + eors a2, a3 + + add a0, a3 + add a1, a3 + + adr a3, 3f + lsls a2, #3 + subs a3, a2 + adds a3, #1 + bx a3 + +.align 2 +1: + subs a0, #8 + subs a1, #8 + sprite_blit8_alpha_body 7 + sprite_blit8_alpha_body 6 + sprite_blit8_alpha_body 5 + sprite_blit8_alpha_body 4 + sprite_blit8_alpha_body 3 + sprite_blit8_alpha_body 2 + sprite_blit8_alpha_body 1 + sprite_blit8_alpha_body 0 +3: + cmp a0, ip + bhi 1b + bx lr + +#endif + +// Note this is the same ideal cycle count as lhu; lhu; sh; sh; but it reduces +// the number of memory accesses by 25%, so less bus contention +.macro storew_alignh rd ra offs + sh \rd, \offs(\ra) + srli \rd, \rd, 16 + sh \rd, \offs+2(\ra) +.endm + +// a0: dst +// a1: src +// a2: pixel 
count +decl_func sprite_blit16 + // Force source pointer to be word-aligned + andi a3, a1, 2 + beqz a3, 1f + lhu a3, (a1) + sh a3, (a0) + addi a0, a0, 2 + addi a1, a1, 2 + addi a2, a2, -1 +1: + // Each loop is 8 pixels. Place limit pointer at 16 bytes before + // end, loop until past it. There will be 0 to 7 pixels remaining. + slli a2, a2, 1 + add a2, a2, a0 + addi a5, a2, -16 + // Early out: + bltu a5, a0, 2f +1: + lw a2, 0(a1) + lw a3, 4(a1) + storew_alignh a2, a0, 0 + storew_alignh a3, a0, 4 + lw a2, 8(a1) + lw a3, 12(a1) + storew_alignh a2, a0, 8 + storew_alignh a3, a0, 12 + addi a0, a0, 16 + addi a1, a1, 16 + bgeu a5, a0, 1b +2: + sub a5, a5, a0 + // At least 4 pixels? (bit 3 -> sign bit) + slli a5, a5, 28 + bgez a5, 1f + lw a2, 0(a1) + lw a3, 4(a1) + storew_alignh a2, a0, 0 + storew_alignh a3, a0, 4 + addi a0, a0, 8 + addi a1, a1, 8 +1: + // At least 2 pixels? + slli a5, a5, 1 + bgez a5, 1f + lw a2, 0(a1) + storew_alignh a2, a0, 0 + addi a0, a0, 4 + addi a1, a1, 4 +1: + // One more pixel? + slli a5, a5, 1 + bgez a5, 1f + lhu a3, (a1) + sh a3, (a0) +1: + ret + +// dst: a0, src: a1, clobbers: a4-a7 +.macro sprite_blit16_alpha_body_x2 n + // Disable RVC to force 32-bit alignment of branch targets without adding + // alignment nops (lhu/sh *may* be 16-bit if Zcb is enabled) +.option push +.option norvc + // Interleave two loads to avoid load->shift dependency stall + lhu a4, 4*\n(a1) + lhu a5, 4*\n+2(a1) + slli a6, a4, 32 - ALPHA_SHIFT_16BPP + slli a7, a5, 32 - ALPHA_SHIFT_16BPP + bgez a6, 3f + sh a4, 4*\n(a0) +3: + bgez a7, 3f + sh a5, 4*\n+2(a0) +3: +.option pop +.endm + +// a0: dst +// a1: src +// a2: pixel count +decl_func sprite_blit16_alpha + // Not using the computed branch approach of the v6-M code as it doesn't + // play nicely with the pairing of pixels used in the loop body here. 
+ slli a2, a2, 1 + add a2, a2, a0 + norvc_3a addi, a2, a2, -16 + bltu a2, a0, 2f +1: + // 8 pixels per loop + sprite_blit16_alpha_body_x2 0 + sprite_blit16_alpha_body_x2 1 + sprite_blit16_alpha_body_x2 2 + sprite_blit16_alpha_body_x2 3 + addi a0, a0, 16 + addi a1, a1, 16 + bgeu a2, a0, 1b +2: + sub a2, a2, a0 + // At least 4 pixels? (bit 3 -> sign bit) + slli a2, a2, 28 + bgez a2, 1f + sprite_blit16_alpha_body_x2 0 + sprite_blit16_alpha_body_x2 1 + addi a0, a0, 8 + addi a1, a1, 8 +1: + // At least 2 pixels? + norvc_3a slli, a2, a2, 1 + bgez a2, 1f + sprite_blit16_alpha_body_x2 0 + addi a1, a1, 4 + addi a0, a0, 4 +1: + // One more pixel? + slli a2, a2, 1 + bgez a2, 1f + lhu a4, (a1) + slli a6, a4, 32 - ALPHA_SHIFT_16BPP + bgez a6, 1f + sh a4, (a0) +1: + ret +// ---------------------------------------------------------------------------- +// Affine-transformed sprite (note these are just the inner loops -- INTERP0 +// must be configured by the caller, which is presumably not written in asm) + +// TODO not yet ported to RISC-V +#if 0 +// r0: raster start pointer +// r1: raster span size (pixels) + +.macro sprite_ablit8_loop_body n + ldr r1, [r3, #CTRL0_OFFS] + ldr r2, [r3, #POP2_OFFS] + lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1 + bcs 2f + ldrb r2, [r2] + strb r2, [r0, #\n] +2: +.endm + +decl_func sprite_ablit8_loop + mov ip, r0 + + lsrs r2, r1, #3 + lsls r2, #3 + eors r1, r2 + add r0, r2 + + adr r2, 3f + movs r3, #12 // Each (non-unrolled) loop body is 12 bytes + muls r1, r3 + subs r2, r1 + adds r2, #1 + + ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + bx r2 + +.align 2 + nop +1: + subs r0, #8 + sprite_ablit8_loop_body 7 + sprite_ablit8_loop_body 6 + sprite_ablit8_loop_body 5 + sprite_ablit8_loop_body 4 + sprite_ablit8_loop_body 3 + sprite_ablit8_loop_body 2 + sprite_ablit8_loop_body 1 + sprite_ablit8_loop_body 0 +3: + cmp r0, ip + bne 1b + bx lr + + + +// As above but bit 5 is assumed to be an alpha bit (RAGB2132) + +.macro sprite_ablit8_alpha_loop_body n + 
ldr r1, [r3, #CTRL0_OFFS] + ldr r2, [r3, #POP2_OFFS] + lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1 + bcs 2f + ldrb r2, [r2] + lsrs r1, r2, #ALPHA_SHIFT_8BPP + bcc 2f + strb r2, [r0, #\n] +2: +.endm + +decl_func sprite_ablit8_alpha_loop + mov ip, r0 + ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + + lsrs r2, r1, #3 + lsls r2, #3 + eors r1, r2 + add r0, r2 + + adr r2, 3f + lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes + subs r2, r1 + adds r2, #1 + bx r2 + +.align 2 + nop +1: + subs r0, #8 + sprite_ablit8_alpha_loop_body 7 + sprite_ablit8_alpha_loop_body 6 + sprite_ablit8_alpha_loop_body 5 + sprite_ablit8_alpha_loop_body 4 + sprite_ablit8_alpha_loop_body 3 + sprite_ablit8_alpha_loop_body 2 + sprite_ablit8_alpha_loop_body 1 + sprite_ablit8_alpha_loop_body 0 +3: + cmp r0, ip + bhi 1b + bx lr + + + +.macro sprite_ablit16_loop_body n + ldr r1, [r3, #CTRL0_OFFS] + ldr r2, [r3, #POP2_OFFS] + lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1 + bcs 2f + ldrh r2, [r2] + strh r2, [r0, #2*\n] +2: +.endm + +decl_func sprite_ablit16_loop + mov ip, r0 + + lsrs r2, r1, #3 + lsls r2, #3 + eors r1, r2 + lsls r2, #1 // Each pixel is 2 bytes + add r0, r2 + + adr r2, 3f + movs r3, #12 // Each (non-unrolled) loop body is 12 bytes + muls r1, r3 + subs r2, r1 + adds r2, #1 + + ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + bx r2 + +.align 2 + nop +1: + subs r0, #16 + sprite_ablit16_loop_body 7 + sprite_ablit16_loop_body 6 + sprite_ablit16_loop_body 5 + sprite_ablit16_loop_body 4 + sprite_ablit16_loop_body 3 + sprite_ablit16_loop_body 2 + sprite_ablit16_loop_body 1 + sprite_ablit16_loop_body 0 +3: + cmp r0, ip + bne 1b + bx lr + +#endif + +#define FIX_OVERF_CHECK 1 + +#ifndef RISCV_HAVE_COMPRESSED_ISA +#error "Address calculations are incorrect if not assembled with C extension" +#endif +.macro sprite_ablit16_alpha_loop_body n + // Instructions which are only compressible under Zcb (e.g. lhu, sh) are + // forced uncompressed, to get consistent size for address calculations. 
+ // This code should be exactly 24 bytes (30 bytes when FIX_OVERF_CHECK is set, per the multiplier in sprite_ablit16_alpha_loop). + + // Bit 25 is OVERF, bit 24 is OVERF1, bits 31:26 are zero, so can test for + // overflow by testing the uppermost byte of CTRL0 for nonzero. +#if !FIX_OVERF_CHECK + norvc_2a lbu a1, CTRL0_OFFS+3(a5) + lw a2, POP2_OFFS(a5) + bnez a1, 2f +#else + lw a1, ACCUM0_OFFS(a5) + lw a3, ACCUM1_OFFS(a5) + lw a2, POP2_OFFS(a5) + srli a1, a1, 7 + 16 + bnez a1, 2f + srli a3, a3, 7 + 16 + bnez a3, 2f +#endif + norvc_2a lhu a2, (a2) + // TODO dep stall on lhu, but it makes the OVERF case faster: + slli a1, a2, 32 - ALPHA_SHIFT_16BPP + bgez a1, 2f + norvc_2a sh a2, 2*\n(a0) +2: +.endm + +decl_func sprite_ablit16_alpha_loop + mv a4, a0 + li a5, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET + + // Split off pixels modulo 8 + andi a2, a1, 0x7 + sub a1, a1, a2 + // Pointer to beginning of endmost block of 8 pixels: + sh1add a0, a1, a0 + + // Compute branch into first loop, which has the modulo-8 pixels. + // Each pixel takes 24 bytes of instructions (30 with FIX_OVERF_CHECK). +#if !FIX_OVERF_CHECK + slli a2, a2, 3 + sh1add a2, a2, a2 +#else + li a3, 30 + mul a2, a2, a3 +#endif + + la a1, 3f + sub a1, a1, a2 + jr a1 + +.align 2 +1: + norvc_3a addi a0, a0, -16 + sprite_ablit16_alpha_loop_body 7 + sprite_ablit16_alpha_loop_body 6 + sprite_ablit16_alpha_loop_body 5 + sprite_ablit16_alpha_loop_body 4 + sprite_ablit16_alpha_loop_body 3 + sprite_ablit16_alpha_loop_body 2 + sprite_ablit16_alpha_loop_body 1 + sprite_ablit16_alpha_loop_body 0 +3: + bltu a4, a0, 1b + ret diff --git a/software/libsprite/tile.c b/software/libsprite/tile.c index 45805fb..acc2cb4 100644 --- a/software/libsprite/tile.c +++ b/software/libsprite/tile.c @@ -1,6 +1,6 @@ #include "tile.h" -#include "pico/platform.h" // for __not_in_flash +#include "pico.h" // for __not_in_flash #include "hardware/interp.h" #define __ram_func(foo) __not_in_flash(#foo) foo diff --git a/software/libsprite/tile.S b/software/libsprite/tile_armv6m.S similarity index 100% rename from software/libsprite/tile.S rename to 
software/libsprite/tile_armv6m.S diff --git a/software/libsprite/tile_riscv.S b/software/libsprite/tile_riscv.S new file mode 100644 index 0000000..7d47657 --- /dev/null +++ b/software/libsprite/tile_riscv.S @@ -0,0 +1,188 @@ +#include "hardware/regs/addressmap.h" +#include "hardware/regs/sio.h" + +#include "sprite_asm_const.h" + +#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) + +// ---------------------------------------------------------------------------- +// Tile layout +// +// Some terms: +// Tileset: 1D array of tile images, concatenated image-after-image +// Tilemap: 2D array of tileset indices +// +// Each tile image in a tileset is the same size. Tiles are square, either 8 x +// 8 px or 16 x 16 px. This makes it easy to find the start of a tile image +// given the tileset base pointer and a tile index (add + shift). +// +// Tilemaps are 8 bits per tile, always. +// +// One advantage of this layout is that y coordinates can be handled outside +// of the loops in this file, which are all scanline-oriented, by offsetting +// the tileset and tilemap pointers passed in. These routines only care about +// x. The tileset pointer is offset by y modulo tile height, and the tilemap +// pointer is offset by y divided by tile height, modulo tilemap height in +// tiles. + +// Tileset: 16px tiles, 16bpp, with 1-bit alpha. +// Tilemap: 8 bit indices.
+ +.macro do_2px_16bpp_alpha rd rs rx dstoffs +.option push +.option norvc + // TODO we could save a shift here by making alpha the MSB (not worth it + // on Arm due to lack of sign-extension or flag update on loads) + slli \rx, \rs, 32 - ALPHA_SHIFT_16BPP + bgez \rx, 1f + sh \rs, \dstoffs(\rd) +1: + slli \rx, \rs, 16 - ALPHA_SHIFT_16BPP + bgez \rx, 1f + srli \rs, \rs, 16 + sh \rs, \dstoffs+2(\rd) +1: +.option pop +.endm + +.macro do_2px_16bpp rd rs dstoffs + sh \rs, \dstoffs(\rd) + srli \rs, \rs, 16 + sh \rs, \dstoffs+2(\rd) +.endm + +// interp1 has been set up to give the next x-ward pointer into the tilemap +// with each pop. This saves us having to remember the tilemap pointer and +// tilemap x size mask in core registers. + +// a0: dst +// a1: tileset +// a2: x0 (start pos in tile space, in pixels) +// a3: x1 (end pos in tile space, in pixels, exclusive) + +// Instantiated with alpha=1 and alpha=0 to get both variants of the loop. +// Linker garbage collection ensures we only keep the versions we use. + +.macro tile16_16px_loop_alpha_or_nonalpha alpha + li a7, SIO_BASE + SIO_INTERP1_ACCUM0_OFFSET + + // The main loop only handles whole tiles, so we may need to first copy + // individual pixels to get tile-aligned. Skip this entirely if we are + // already aligned, to avoid the extra interp pop.
+ andi a5, a2, 0xf + beqz a5, 3f + + // Get pointer to tileset image + lw a4, POP2_OFFS(a7) + lbu a4, (a4) // dep stall + slli a4, a4, 9 // 16 px wide * 16 px high * 2 bytes/px + add a4, a4, a1 + // Offset tile image pointer to align with x0 + sh1add a4, a5, a4 + // Fall through into copy loop +1: + lhu a5, (a4) + addi a4, a4, 2 // hoisted to fill load dependency slot +.if \alpha + slli a6, a5, 32 - ALPHA_SHIFT_16BPP + bgez a6, 2f +.endif + sh a5, (a0) +2: + addi a0, a0, 2 + addi a2, a2, 1 + // Skip out if we have already reached end of span: + bgeu a2, a3, 3f + // Loop if we are not yet aligned: (TODO these checks could be merged) + andi a6, a2, 0xf + bnez a6, 1b +3: + // The next output pixel is aligned to the start of a tile, or the span has already ended. Set up main loop. + + // Tileset pointer is only needed occasionally, so free up a1 for better + // code density: + mv t0, a1 + // t1: dst limit pointer at end of all pixels: + sub a3, a3, a2 + sh1add t1, a3, a0 + // a5: dst limit pointer at end of whole tiles: + andi a4, a3, ~0xf + sh1add a5, a4, a0 + + // a0 is dst, a7 is interp base, a1-a4 are trashed by loop, a5 is dst limit.
+ // Early skip for case of 0 whole tiles: + bgeu a0, a5, 3f +2: + // Get next tilemap pointer + lw a1, POP2_OFFS(a7) + // Get tile image pointer + lbu a1, (a1) // dep stall + slli a1, a1, 9 // 512 bytes per tile image, as in the head loop + add a1, a1, t0 + +.if \alpha + lw a3, 0(a1) + lw a4, 4(a1) + do_2px_16bpp_alpha a0 a3 a2 0 + do_2px_16bpp_alpha a0 a4 a2 4 + lw a3, 8(a1) + lw a4, 12(a1) + do_2px_16bpp_alpha a0 a3 a2 8 + do_2px_16bpp_alpha a0 a4 a2 12 + lw a3, 16(a1) + lw a4, 20(a1) + do_2px_16bpp_alpha a0 a3 a2 16 + do_2px_16bpp_alpha a0 a4 a2 20 + lw a3, 24(a1) + lw a4, 28(a1) + do_2px_16bpp_alpha a0 a3 a2 24 + do_2px_16bpp_alpha a0 a4 a2 28 +.else + lw a3, 0(a1) + lw a4, 4(a1) + do_2px_16bpp a0 a3 0 + do_2px_16bpp a0 a4 4 + lw a3, 8(a1) + lw a4, 12(a1) + do_2px_16bpp a0 a3 8 + do_2px_16bpp a0 a4 12 + lw a3, 16(a1) + lw a4, 20(a1) + do_2px_16bpp a0 a3 16 + do_2px_16bpp a0 a4 20 + lw a3, 24(a1) + lw a4, 28(a1) + do_2px_16bpp a0 a3 24 + do_2px_16bpp a0 a4 28 +.endif + addi a0, a0, 32 + bltu a0, a5, 2b +3: + + // Skip ahead if there are no spare pixels to tidy up + bgeu a0, t1, 3f + // Copy <1 tile's worth of loose pixels + lw a4, POP2_OFFS(a7) + lbu a4, (a4) // dep stall + slli a4, a4, 9 // 512 bytes per tile image, as in the head loop + add a4, a4, t0 +1: + lhu a5, (a4) // zero-extending load, matching the head-alignment loop + addi a4, a4, 2 +.if \alpha + slli a6, a5, 32 - ALPHA_SHIFT_16BPP + bgez a6, 2f +.endif + sh a5, (a0) +2: + addi a0, a0, 2 + bltu a0, t1, 1b +3: + ret +.endm + +decl_func tile16_16px_alpha_loop + tile16_16px_loop_alpha_or_nonalpha 1 + +decl_func tile16_16px_loop + tile16_16px_loop_alpha_or_nonalpha 0