RP2350 changes (including RISC-V)

2024-08-10 13:29:14 -07:00 · 2024-08-10 13:29:14 -07:00 · ca941baf37
commit ca941baf37
--- a/Readme.md
+++ b/Readme.md
@ -1,3 +1,30 @@
+RP2350 PicoDVI Preview
+======================
+
+Changes from the public GitHub version:
+
+* All Arm assembly in `libdvi` has been ported to RISC-V and tuned for Hazard3
+* Some of the existing Arm assembly in `libdvi` has been tweaked for better performance on Cortex-M33
+* RGB encode now uses the SIO TMDS encoders by default on RP2350 (can be disabled by defining `DVI_USE_SIO_TMDS_ENCODER=0` -- see `software/libdvi/dvi_config_defs.h`)
+* Much of the Arm assembly in `libsprite` has been ported to RISC-V -- enough to run the stock demos
+
+Build instructions:
+
+```bash
+cd software
+mkdir build && cd build
+# PICO_PLATFORM can also be rp2350-riscv
+# List of DVI configs is in software/include/common_dvi_pin_configs.h
+cmake -DPICO_SDK_PATH=/path/to/sdk -DPICO_PLATFORM=rp2350 -DPICO_COPY_TO_RAM=1 -DDVI_DEFAULT_SERIAL_CONFIG=pico_sock_cfg ..
+make -j$(nproc)
+# Then flash a binary, e.g. by copying it to the board's mounted USB boot drive:
+cp apps/tiles_and_sprites/tiles_and_sprites.uf2 /path/to/mounted/boot/drive/
+```
+
+If you plan to run the `vista` demo, then note that there are now two UF2 data files, `software/assets/vista_data_rp2040.uf2` and `software/assets/vista_data_rp2350.uf2`. The only difference is the family IDs: the first can be dragged on RP2040 and on RP2350 A0, and the second can be dragged on RP2350 A1 and later.
+
+The following is the original RP2040 writeup:
+
 Bitbanged DVI on the RP2040 Microcontroller
 ===========================================

--- a/software/.gitignore
+++ b/software/.gitignore
@ -1,2 +1,3 @@
 build
 *.swp
+build-*
--- a/software/apps/CMakeLists.txt
+++ b/software/apps/CMakeLists.txt
@ -1,4 +1,7 @@
-add_subdirectory(bad_apple)
+if (NOT PICO_RISCV)
+	# Arm assembly needs porting to RISC-V
+	add_subdirectory(bad_apple)
+endif()
 add_subdirectory(colour_terminal)
 add_subdirectory(christmas_snowflakes)
 add_subdirectory(dht_logging)
@ -12,5 +15,8 @@ add_subdirectory(tiles)
 add_subdirectory(tiles_and_sprites)
 add_subdirectory(tiles_parallax)
 add_subdirectory(vista)
-add_subdirectory(vista-palette)
+if (PICO_RP2040)
+	# Needs porting to use XIP stream instead of SSI, as was done to vista
+	add_subdirectory(vista-palette)
+endif()
 add_subdirectory(mandel-full)
--- a/software/apps/colour_terminal/main.c
+++ b/software/apps/colour_terminal/main.c
@ -8,7 +8,6 @@
 #include "hardware/gpio.h"
 #include "hardware/vreg.h"
 #include "hardware/structs/bus_ctrl.h"
-#include "hardware/structs/ssi.h"
 #include "hardware/dma.h"
 #include "pico/sem.h"

--- a/software/apps/colour_terminal/tmds_encode_font_2bpp.S
+++ b/software/apps/colour_terminal/tmds_encode_font_2bpp.S
@ -1,9 +1,11 @@
 #include "hardware/regs/addressmap.h"
 #include "hardware/regs/sio.h"

+#ifndef __riscv
 .syntax unified
 .cpu cortex-m0plus
 .thumb
+#endif

 // Using the following:
 //
@ -46,12 +48,13 @@
 // r8 contains a pointer to the font bitmap for this scanline.
 // r9 contains the TMDS LUT base.
 .macro do_char charbuf_offs colour_shift_instr colour_shamt
+#ifndef __riscv
 	// Get 8x font bits for next character, put 4 LSBs in bits 6:3 of r4 (so
 	// scaled to 8-byte LUT entries), and 4 MSBs in bits 6:3 of r6.
-	ldrb r4, [r0, #\charbuf_offs]                                     // 2
-	add r4, r8                                                        // 1
-	ldrb r4, [r4]                                                     // 2
-	lsrs r6, r4, #4                                                   // 1
+	ldrb r4, [r0, #\charbuf_offs]                                     // 2 (note these cycle
+	add r4, r8                                                        // 1  counts are for M0+
+	ldrb r4, [r4]                                                     // 2  and are a little
+	lsrs r6, r4, #4                                                   // 1  pessimistic on M33)
 	lsls r6, #3                                                       // 1
 	lsls r4, #28                                                      // 1
 	lsrs r4, #25                                                      // 1
@ -67,6 +70,31 @@
 	ldmia r4, {r4, r5}                                                // 3
 	ldmia r6, {r6, r7}                                                // 3
 	stmia r2!, {r4-r7}                                                // 5
+#else
+	lbu a4, \charbuf_offs(a0)                                         // 1
+	\colour_shift_instr a5, a1, \colour_shamt                         // 1
+	add a4, a4, t1                                                    // 1
+	lbu a4, (a4)                                                      // 2
+	srli a6, a4, 4                                                    // 1
+	andi a4, a4, 0xf                                                  // 1
+
+	// Get colour bits, add to TMDS LUT base and font bits
+	and a5, a5, a3                                                    // 1
+	add a5, a5, t2                                                    // 1
+	sh3add a4, a4, a5                                                 // 1
+	sh3add a6, a6, a5                                                 // 1
+
+	// Look up and write out 8 TMDS symbols
+	lw a5, 4(a4)                                                      // 1
+	lw a4, 0(a4)                                                      // 1
+	lw a7, 4(a6)                                                      // 1
+	lw a6, 0(a6)                                                      // 1
+	sw a4, 0(a2)                                                      // 1
+	sw a5, 4(a2)                                                      // 1
+	sw a6, 8(a2)                                                      // 1
+	sw a7, 12(a2)                                                     // 1
+	addi a2, a2, 16                                                   // 1
+#endif
 .endm


@ -78,9 +106,12 @@

 .section .scratch_x.tmds_encode_font_2bpp, "ax"
 .global tmds_encode_font_2bpp
+#ifndef __riscv
 .type tmds_encode_font_2bpp,%function
 .thumb_func
+#endif
 tmds_encode_font_2bpp:
+#ifndef __riscv
 	push {r4-r7, lr}
 	mov r4, r8
 	mov r5, r9
@ -123,6 +154,32 @@ tmds_encode_font_2bpp:
 	mov r10, r6
 	pop {r4-r7, pc}

+#else
+
+	sh1add t0, a3, a2  // t0 = a2 + 2 * a3: output limit pointer
+	li a3, 0xf0 * 8  // Mask applied by do_char to the shifted colour word
+
+	mv t1, a4  // t1: font bitmap pointer, added to each character code in do_char
+	la t2, palettised_1bpp_tables  // t2: TMDS LUT base used by do_char
+	mv t3, a1  // t3: colour word pointer (a1 is reused for the current word)
+
+	bgeu a2, t0, 2f  // Empty output range: skip the loop
+1:
+	lw a1, (t3)  // One colour word feeds the eight do_char invocations below
+	addi t3, t3, 4
+	do_char 0 slli 7
+	do_char 1 slli 3
+	do_char 2 srli 1
+	do_char 3 srli 5
+	do_char 4 srli 9
+	do_char 5 srli 13
+	do_char 6 srli 17
+	do_char 7 srli 21
+	addi a0, a0, 8  // Eight character bytes consumed per iteration
+	bltu a2, t0, 1b  // a2 is advanced by 16 bytes inside each do_char
+2:
+	ret
+#endif

 // Table generation:
 //	levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]
--- a/software/apps/mandel-full/main.c
+++ b/software/apps/mandel-full/main.c
@ -8,7 +8,6 @@
 #include "hardware/pll.h"
 #include "hardware/sync.h"
 #include "hardware/structs/bus_ctrl.h"
-#include "hardware/structs/ssi.h"
 #include "hardware/vreg.h"
 #include "pico/multicore.h"
 #include "pico/sem.h"
--- a/software/apps/terminal/main.c
+++ b/software/apps/terminal/main.c
@ -8,7 +8,6 @@
 #include "hardware/gpio.h"
 #include "hardware/vreg.h"
 #include "hardware/structs/bus_ctrl.h"
-#include "hardware/structs/ssi.h"
 #include "hardware/dma.h"
 #include "pico/sem.h"

--- a/software/apps/tiles_parallax/main.c
+++ b/software/apps/tiles_parallax/main.c
@ -127,8 +127,10 @@ void __not_in_flash("render") render_loop() {
 			tile16(pixbuf, &bg1, y, FRAME_WIDTH);
 			queue_add_blocking(&dvi0.q_colour_valid, &pixbuf);
 		}
-		bg0.xscroll += 1;
-		bg1.xscroll += 2;
+		bg1.xscroll += 1;
+		if (frame_ctr & 1) {
+			bg0.xscroll += 1;
+		}
 		++frame_ctr;
 	}
 }
--- a/software/apps/vista-palette/CMakeLists.txt
+++ b/software/apps/vista-palette/CMakeLists.txt
@ -10,9 +10,19 @@ add_executable(vista-palette
 # flash using direct SSI DMA, which would trample on XIP.
 pico_set_binary_type(vista-palette copy_to_ram)

-pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2_common/boot_stage2/boot2_w25q080.S)
-target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
-pico_set_boot_stage2(vista-palette vista-palette_boot2)
+
+if (PICO_RP2040)
+	pico_define_boot_stage2(vista-palette_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
+	pico_set_boot_stage2(vista-palette vista-palette_boot2)
+	target_compile_definitions(vista-palette_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
+else ()
+	target_compile_definitions(vista-palette PRIVATE
+		PICO_EMBED_XIP_SETUP=1
+		PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
+		PICO_FLASH_SPI_CLKDIV=2
+		PICO_FLASH_SPI_RXDELAY=3
+		)
+endif()

 target_compile_definitions(vista-palette PRIVATE
 	DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG}
--- a/software/apps/vista/CMakeLists.txt
+++ b/software/apps/vista/CMakeLists.txt
@ -13,6 +13,19 @@ target_compile_definitions(vista PRIVATE
 	DVI_SYMBOLS_PER_WORD=1
 	)

+if (PICO_RP2040)
+	pico_define_boot_stage2(vista_boot2 ${PICO_SDK_PATH}/src/rp2040/boot_stage2/boot2_w25q080.S)
+	pico_set_boot_stage2(vista vista_boot2)
+	target_compile_definitions(vista_boot2 PRIVATE PICO_FLASH_SPI_CLKDIV=4)
+else ()
+	target_compile_definitions(vista PRIVATE
+		PICO_EMBED_XIP_SETUP=1
+		PICO_BOOT_STAGE2_CHOOSE_W25Q080=1
+		PICO_FLASH_SPI_CLKDIV=2
+		PICO_FLASH_SPI_RXDELAY=3
+		)
+endif()
+
 target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200)

 target_link_libraries(vista
--- a/software/apps/vista/main.c
+++ b/software/apps/vista/main.c
@ -7,11 +7,17 @@
 #include "hardware/pll.h"
 #include "hardware/sync.h"
 #include "hardware/structs/bus_ctrl.h"
-#include "hardware/structs/ssi.h"
 #include "hardware/vreg.h"
 #include "pico/multicore.h"
 #include "pico/sem.h"
 #include "pico/stdlib.h"
+#if PICO_RP2040
+#include "hardware/structs/ssi.h"
+#else
+#include "hardware/structs/xip_ctrl.h"
+#include "hardware/structs/xip_aux.h"
+#include "hardware/structs/qmi.h"
+#endif

 #include "tmds_encode.h"

@ -45,27 +51,40 @@ static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf
 	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11);
 }

-void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan)
-{
+void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan) {
+#if PICO_RP2040
+	// On RP2040, program the SSI to clock the correct amount of data without stopping
 	ssi_hw->ssienr = 0;
 	ssi_hw->ctrlr1 = len - 1; // NDF, number of data frames
 	ssi_hw->dmacr = SSI_DMACR_TDMAE_BITS | SSI_DMACR_RDMAE_BITS;
 	ssi_hw->ssienr = 1;
 	// Other than NDF, the SSI configuration used for XIP is suitable for a bulk read too.
-
-	dma_hw->ch[dma_chan].read_addr = (uint32_t)&ssi_hw->dr0;
+	const uintptr_t read_addr = (uintptr_t)&ssi_hw->dr0;
+	const uint dreq = DREQ_XIP_SSIRX;
+	const bool bswap = true;
+#else
+	// On RP2350, SSI is gone, but XIP streaming is fast enough to keep up with this demo
+	// (you can still DMA to the DIRECT_MODE FIFOs if you really need 100%)
+	xip_ctrl_hw->stream_addr = flash_offs;
+	xip_ctrl_hw->stream_ctr = len;
+	const uintptr_t read_addr = (uintptr_t)&xip_aux_hw->stream;
+	const uint dreq = DREQ_XIP_STREAM;
+	const bool bswap = false;
+#endif
+	dma_hw->ch[dma_chan].read_addr = read_addr;
 	dma_hw->ch[dma_chan].write_addr = (uint32_t)rxbuf;
 	dma_hw->ch[dma_chan].transfer_count = len;
 	dma_hw->ch[dma_chan].ctrl_trig =
-		DMA_CH0_CTRL_TRIG_BSWAP_BITS |
-		DREQ_XIP_SSIRX << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
+		(uint)bswap << DMA_CH0_CTRL_TRIG_BSWAP_LSB |
+		dreq << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB |
 		dma_chan << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB |
 		DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS |
 		DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB |
 		DMA_CH0_CTRL_TRIG_EN_BITS;
-
+#if PICO_RP2040
 	// Now DMA is waiting, kick off the SSI transfer (mode continuation bits in LSBs)
 	ssi_hw->dr0 = (flash_offs << 8) | 0xa0;
+#endif
 }

 // Core 1 handles DMA IRQs and runs TMDS encode on scanline buffers it
@ -91,6 +110,15 @@ int __not_in_flash("main") main() {
 	sleep_ms(10);
 	set_sys_clock_khz(DVI_TIMING.bit_clk_khz, true);

+	// A0 SDK won't pick up on the PICO_EMBED_XIP_SETUP flag, so just to make sure:
+#if PICO_RP2350
+	hw_write_masked(
+		&qmi_hw->m[0].timing,
+		3 << QMI_M0_TIMING_RXDELAY_LSB | 2 << QMI_M0_TIMING_CLKDIV_LSB,
+		QMI_M0_TIMING_RXDELAY_BITS | QMI_M0_TIMING_CLKDIV_BITS
+	);
+#endif
+
 	setup_default_uart();

 	gpio_init(LED_PIN);
@ -131,21 +159,25 @@ int __not_in_flash("main") main() {
 		}
 		for (int y = 0; y < 2 * FRAME_HEIGHT; y += 2) {
 			// Start DMA to back buffer before starting to encode the front buffer (each buffer is two scanlines)
+#if !PICO_RP2040
+			// On RP2040 we could never reach this point early, because of the slow encode!
+			dma_channel_wait_for_finish_blocking(img_dma_chan);
+#endif
 			flash_bulk_dma_start(
 				(uint32_t*)img_buf[img_buf_back],
 				current_image_base + ((y + 2) % (2 * FRAME_HEIGHT)) * IMAGE_SCANLINE_SIZE,
 				IMAGE_SCANLINE_SIZE * 2 / sizeof(uint32_t),
 				img_dma_chan
 			);
-			const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];			
+			const uint16_t *img = (const uint16_t*)img_buf[img_buf_front];
 			uint32_t *our_tmds_buf, *their_tmds_buf;
 			queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf);
 			multicore_fifo_push_blocking((uint32_t)(img));
 			multicore_fifo_push_blocking((uint32_t)their_tmds_buf);
-	
+
 			queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf);
 			prepare_scanline((const uint32_t*)(img + FRAME_WIDTH * 2), our_tmds_buf);
-			
+
 			multicore_fifo_pop_blocking();
 			queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf);
 			queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf);
@ -156,4 +188,3 @@ int __not_in_flash("main") main() {
 	}
 	__builtin_unreachable();
 }
-	
--- a/software/assets/vista_data_rp2040.uf2
+++ b/software/assets/vista_data_rp2040.uf2
--- a/software/assets/vista_data_rp2350.uf2
+++ b/software/assets/vista_data_rp2350.uf2
--- a/software/include/common_dvi_pin_configs.h
+++ b/software/include/common_dvi_pin_configs.h
@ -28,6 +28,17 @@ static const struct dvi_serialiser_cfg picodvi_reva_dvi_cfg = {
 	.invert_diffpairs = true
 };

+// AMY-DVI board, for getting HDMI from the RP2350 FPGA development platform,
+// again a cursed board that only a couple of people in the world possess:
+static const struct dvi_serialiser_cfg amy_dvi_cfg = {
+	.pio = DVI_DEFAULT_PIO_INST,
+	.sm_tmds = {0, 1, 2},
+	.pins_tmds = {14, 16, 18},
+	.pins_clk = 12,
+	.invert_diffpairs = true
+};
+
+
 // The not-HDMI socket on Rev C PicoDVI boards
 // (we don't talk about Rev B)
 static const struct dvi_serialiser_cfg picodvi_dvi_cfg = {
--- a/software/libdvi/dvi.c
+++ b/software/libdvi/dvi.c
@ -184,7 +184,7 @@ static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
 	// Make sure all three channels have definitely loaded their last block
 	// (should be within a few cycles of one another)
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
-		while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
+		while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].dbg_tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
 			tight_loop_contents();
 	}

--- a/software/libdvi/dvi_config_defs.h
+++ b/software/libdvi/dvi_config_defs.h
@ -51,8 +51,16 @@
 #define DVI_SYMBOLS_PER_WORD 2
 #endif

-#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
-#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
+// Implement TMDS encode with hardware encoders in SIO, instead of
+// interpolators + LUTs. The processor still has to crank the encoder, but
+// it's much faster. This still works with PIO serialisers, which can appear
+// on any GPIO, unlike the HSTX which is limited to specific GPIOs.
+#ifndef DVI_USE_SIO_TMDS_ENCODER
+#if PICO_RP2040
+#define DVI_USE_SIO_TMDS_ENCODER 0
+#else
+#define DVI_USE_SIO_TMDS_ENCODER 1
+#endif
 #endif

 // ----------------------------------------------------------------------------
--- a/software/libdvi/tmds_encode.S
+++ b/software/libdvi/tmds_encode.S
@ -2,6 +2,10 @@
 #include "hardware/regs/sio.h"
 #include "dvi_config_defs.h"

+// This file contains both Arm and RISC-V source, with the correct version
+// selected via the __arm__ and __riscv predefined macros. The targeted Arm
+// dialect is Armv6-M, and the targeted RISC-V dialect is RV32IZba
+
 // Offsets suitable for ldr/str (must be <= 0x7c):
 #define ACCUM0_OFFS     (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
 #define ACCUM1_OFFS     (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
@ -13,23 +17,33 @@
 // Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
 // word-addressed space... almost as though it were intentional! :)

+#if defined(__arm__) && defined(__riscv)
+#error "wat"
+#endif
+
+#ifdef __arm__
 .syntax unified
 .cpu cortex-m0plus
 .thumb
+#endif

 .macro decl_func_x name
 .section .scratch_x.\name, "ax"
 .global \name
+#ifdef __arm__
 .type \name,%function
 .thumb_func
+#endif
 \name:
 .endm

 .macro decl_func_y name
 .section .scratch_y.\name, "ax"
 .global \name
+#ifdef __arm__
 .type \name,%function
 .thumb_func
+#endif
 \name:
 .endm

@ -41,7 +55,10 @@
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
+// r3: Left shift (for the *_leftshift variant only -- costs 1 cycle per 2 pixels)

+#if defined(__arm__)
+// Armv6-M:
 .macro do_channel_16bpp r_ibase r_inout0 r_out1
 	str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
 	ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
@ -50,8 +67,11 @@
 	ldr \r_out1, [\r_out1]
 .endm

-decl_func tmds_encode_loop_16bpp
+.macro tmds_encode_loop_16bpp_impl leftshift
 	push {r4, r5, r6, r7, lr}
+	// Bounds calculation: each input pixel results in two output pixels,
+	// whose two TMDS symbols are packed in a single 32-bit word. So, 4 bytes
+	// out per one pixel in.
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
@ -61,7 +81,13 @@ decl_func tmds_encode_loop_16bpp
 1:
 .rept TMDS_ENCODE_UNROLL
 	ldmia r0!, {r4, r6}
+.if \leftshift
+	lsls r4, r3
+.endif
 	do_channel_16bpp r2, r4, r5
+.if \leftshift
+	lsls r6, r3
+.endif
 	do_channel_16bpp r2, r6, r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
@ -69,82 +95,72 @@ decl_func tmds_encode_loop_16bpp
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
+.endm

-// Same as above, but scale data to make up for lack of left shift
-// in interpolator (costs 1 cycle per 2 pixels)
-//
-// r0: Input buffer (word-aligned)
-// r1: Output buffer (word-aligned)
-// r2: Input size (pixels)
-// r3: Left shift amount
+#elif defined(__riscv)
+.macro do_channel_16bpp r_ibase r_inout0 r_out1
+	sw \r_inout0, ACCUM0_OFFS(\r_ibase)  // Write the input word to the ACCUM0 register
+	// Note two halves are interleaved to avoid load->addr dependency
+	lw \r_inout0, PEEK0_OFFS(\r_ibase)  // PEEK0/PEEK1 results are used as addresses below
+	lw \r_out1, PEEK1_OFFS(\r_ibase)
+	lw \r_inout0, (\r_inout0)  // Replace each address with the word it points to
+	lw \r_out1, (\r_out1)
+.endm
+
+.macro tmds_encode_loop_16bpp_impl leftshift
+	slli a2, a2, 2  // Output size in bytes: 4 bytes out per pixel in
+	add t0, a2, a1  // t0: end-of-output pointer
+	bgeu a1, t0, 2f  // Skip the loop entirely for an empty output range
+	li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET  // a2: register base passed to do_channel_16bpp
+.align 2
+1:
+.set i, 0
+.rept TMDS_ENCODE_UNROLL
+	lw a4, 8 * i + 0(a0)  // Two input words per unrolled step
+	lw a6, 8 * i + 4(a0)
+.if \leftshift
+	sll a4, a4, a3  // a3: left shift amount (leftshift variant only)
+	sll a6, a6, a3
+.endif
+	do_channel_16bpp a2, a4, a5
+	do_channel_16bpp a2, a6, a7
+	sw a4, 16 * i + 0(a1)  // Four output words per unrolled step
+	sw a5, 16 * i + 4(a1)
+	sw a6, 16 * i + 8(a1)
+	sw a7, 16 * i + 12(a1)
+.set i, i + 1
+.endr
+	addi a0, a0, 8 * TMDS_ENCODE_UNROLL  // Pointers advance once per iteration; the i-scaled offsets cover the unroll
+	addi a1, a1, 16 * TMDS_ENCODE_UNROLL
+	bltu a1, t0, 1b
+2:
+	ret
+.endm
+
+#else
+#error "Unknown architecture"
+#endif
+
+decl_func tmds_encode_loop_16bpp
+tmds_encode_loop_16bpp_impl 0

 decl_func tmds_encode_loop_16bpp_leftshift
-	push {r4, r5, r6, r7, lr}
-	lsls r2, #2
-	add r2, r1
-	mov ip, r2
-	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
-	b 2f
-.align 2
-1:
-.rept TMDS_ENCODE_UNROLL
-	ldmia r0!, {r4, r6}
-	lsls r4, r3
-	do_channel_16bpp r2, r4, r5
-	lsls r6, r3
-	do_channel_16bpp r2, r6, r7
-	stmia r1!, {r4, r5, r6, r7}
-.endr
-2:
-	cmp r1, ip
-	bne 1b
-	pop {r4, r5, r6, r7, pc}
+tmds_encode_loop_16bpp_impl 1

 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
-
-decl_func tmds_encode_loop_8bpp
-	push {r4, r5, r6, r7, lr}
-	lsls r2, #2
-	add r2, r1
-	mov ip, r2
-	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
-	b 2f
-.align 2
-1:
-.rept TMDS_ENCODE_UNROLL
-	ldmia  r0!, {r4}
-	str r4, [r2, #ACCUM0_OFFS + INTERP1]
-	str r4, [r2, #ACCUM0_OFFS]
-	ldr r4, [r2, #PEEK0_OFFS]
-	ldr r4, [r4]
-	ldr r5, [r2, #PEEK1_OFFS]
-	ldr r5, [r5]
-	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
-	ldr r6, [r6]
-	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
-	ldr r7, [r7]
-	stmia r1!, {r4, r5, r6, r7}
-.endr
-2:
-	cmp r1, ip
-	bne 1b
-	pop {r4, r5, r6, r7, pc}
-
-// r0: Input buffer (word-aligned)
-// r1: Output buffer (word-aligned)
-// r2: Input size (pixels)
-// r3: Left shift amount
+// r3: Left shift amount (for the *_leftshift variant of the function)
 //
 // Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
 // the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
 // the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
 // since its channel MSBs are no greater than 7.

-decl_func tmds_encode_loop_8bpp_leftshift
+#if defined(__arm__)
+.macro tmds_encode_loop_8bpp_impl leftshift
 	push {r4, r5, r6, r7, lr}
-	lsls r2, #3
+	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@ -154,7 +170,9 @@ decl_func tmds_encode_loop_8bpp_leftshift
 .rept TMDS_ENCODE_UNROLL
 	ldmia  r0!, {r4}
 	str r4, [r2, #ACCUM0_OFFS + INTERP1]
+.if \leftshift
 	lsls r4, r3
+.endif
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
 	ldr r4, [r4]
@ -170,6 +188,54 @@ decl_func tmds_encode_loop_8bpp_leftshift
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
+.endm
+
+#elif defined(__riscv)
+.macro tmds_encode_loop_8bpp_impl leftshift
+	slli a2, a2, 2  // Output size in bytes: 4 bytes per input pixel
+	add a2, a2, a1
+	bgeu a1, a2, 2f  // Skip the loop entirely for an empty output range
+	mv t0, a2  // t0: end-of-output pointer (a2 is reused as the register base)
+	li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
+.align 2
+1:
+.set i, 0
+.rept TMDS_ENCODE_UNROLL
+	lw a4, 4 * i(a0)  // One input word per unrolled step
+	sw a4, ACCUM0_OFFS + INTERP1(a2)  // interp1 always receives the unshifted word
+.if \leftshift
+	sll a4, a4, a3  // Only the copy written to interp0 is shifted
+.endif
+	sw a4, ACCUM0_OFFS(a2)
+	lw a4, PEEK0_OFFS(a2)  // PEEK loads and their dereferences are interleaved pairwise
+	lw a5, PEEK1_OFFS(a2)
+	lw a4, (a4)
+	lw a5, (a5)
+	lw a6, PEEK0_OFFS + INTERP1(a2)
+	lw a7, PEEK1_OFFS + INTERP1(a2)
+	lw a6, (a6)
+	lw a7, (a7)
+	sw a4, 16 * i +  0(a1)  // Four output words per input word
+	sw a5, 16 * i +  4(a1)
+	sw a6, 16 * i +  8(a1)
+	sw a7, 16 * i + 12(a1)
+.set i, i + 1
+.endr
+	addi a0, a0, TMDS_ENCODE_UNROLL * 4  // Pointers advance once per iteration; the i-scaled offsets cover the unroll
+	addi a1, a1, TMDS_ENCODE_UNROLL * 16
+	bltu a1, t0, 1b
+2:
+	ret
+.endm
+
+#else
+#error "Unknown architecture"
+#endif
+
+decl_func tmds_encode_loop_8bpp
+tmds_encode_loop_8bpp_impl 0
+decl_func tmds_encode_loop_8bpp_leftshift
+tmds_encode_loop_8bpp_impl 1

 // ----------------------------------------------------------------------------
 // Fast 1bpp black/white encoder (full res)
@ -190,6 +256,8 @@ decl_func tmds_encode_loop_8bpp_leftshift
 // r3 contains lookup mask (preshifted)
 // r8 contains pointer to encode table
 // 2.125 cyc/pix
+
+#if defined(__arm__)
 .macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
 	\shift_instr0 r4, r2, #\shamt0
 	ands r4, r3
@ -238,6 +306,58 @@ decl_func tmds_encode_1bpp
 	mov r8, r7
 	pop {r4-r7, pc}

+#elif defined(__riscv)
+// TODO the register allocation is not optimal here for code size
+.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
+	\shift_instr0 a4, a2, \shamt0  // Move the first index field of the input word (a2) under the mask
+	and a4, a4, a3  // a3: index mask, already scaled to the table entry size
+	add a4, a4, t1  // t1: encode table base
+	lw a5, 4(a4)  // Upper word first, while a4 still holds the entry address
+	lw a4, 0(a4)
+	\shift_instr1 a6, a2, \shamt1  // Same again for the second index field
+	and a6, a6, a3
+	add a6, a6, t1
+	lw a7, 4(a6)
+	lw a6, 0(a6)
+	sw a4, 0(a1)  // Two table entries (four words) written per invocation
+	sw a5, 4(a1)
+	sw a6, 8(a1)
+	sw a7, 12(a1)
+	addi a1, a1, 16  // Output pointer is advanced here, not by the caller
+.endm
+
+// a0: input buffer (word-aligned)
+// a1: output buffer (word-aligned)
+// a2: output pixel count
+decl_func tmds_encode_1bpp
+	slli a2, a2, 1  // Output size in bytes: 2 bytes per pixel
+	add t0, a2, a1  // t0: end-of-output pointer
+	la t1, tmds_1bpp_table  // t1: encode table base used by tmds_encode_1bpp_body
+	// Mask: 4 bit index, 8 bytes per entry
+	li a3, 0x78
+	bgeu a1, t0, 2f  // Empty output range: skip the loop
+1:
+	lw a2, (a0)  // a2 is reused for the current input word
+	addi a0, a0, 4
+#if !DVI_1BPP_BIT_REVERSE
+	tmds_encode_1bpp_body slli 3  srli 1
+	tmds_encode_1bpp_body srli 5  srli 9
+	tmds_encode_1bpp_body srli 13 srli 17
+	tmds_encode_1bpp_body srli 21 srli 25
+#else
+	tmds_encode_1bpp_body srli 1   slli 3
+	tmds_encode_1bpp_body srli 9   srli 5
+	tmds_encode_1bpp_body srli 17  srli 13
+	tmds_encode_1bpp_body srli 25  srli 21
+#endif
+	bltu a1, t0, 1b  // a1 was advanced by 16 bytes in each body invocation
+2:
+	ret
+
+#else
+#error "Unknown architecture"
+#endif
+
 .align 2
 tmds_1bpp_table:
 #if !DVI_1BPP_BIT_REVERSE
@ -299,6 +419,7 @@ tmds_1bpp_table:
 // level 2: (a5 -> 163) always
 // level 3: (ef -> 2f0) always

+#if defined(__arm__)
 // Table base pointer in r0. Input pixels in r2.
 .macro encode_2bpp_body shift_instr shamt rd
 	\shift_instr \rd, r2, #\shamt
@ -343,6 +464,55 @@ decl_func tmds_encode_2bpp
 	mov r8, r7
 	pop {r4-r7, pc}

+#elif defined(__riscv)
+// Table base pointer in a0. Input pixels in a2. Index mask in a3.
+.macro encode_2bpp_body shift_instr shamt rd
+	\shift_instr \rd, a2, \shamt  // Move the wanted index field under the mask
+	and \rd, \rd, a3
+	add \rd, \rd, a0  // Address of the table entry for this index
+	lw \rd, (\rd)  // rd = table word for this index
+.endm
+
+// a0: input buffer (word-aligned)
+// a1: output buffer (word-aligned)
+// a2: output pixel count
+decl_func tmds_encode_2bpp
+	mv t1, a0  // t1: input pointer (a0 is repurposed as the table base)
+	la a0, tmds_2bpp_table
+	// Mask: 4-bit index into 4-byte entries.
+	li a3, 0x3c
+	// Limit pointer: 1 word per 2 pixels
+	slli a2, a2, 1
+	add t0, a2, a1
+	bgeu a1, t0, 2f  // Empty output range: skip the loop and return
+1:
+	lw a2, (t1)  // a2 is reused for the current input word
+	addi t1, t1, 4
+	encode_2bpp_body slli 2  a4
+	encode_2bpp_body srli 2  a5
+	encode_2bpp_body srli 6  a6
+	encode_2bpp_body srli 10 a7
+	sw a4, 0(a1)  // First four output words (low half of the input word)
+	sw a5, 4(a1)
+	sw a6, 8(a1)
+	sw a7, 12(a1)
+	encode_2bpp_body srli 14 a4
+	encode_2bpp_body srli 18 a5
+	encode_2bpp_body srli 22 a6
+	encode_2bpp_body srli 26 a7
+	sw a4, 16(a1)  // Last four output words (high half of the input word)
+	sw a5, 20(a1)
+	sw a6, 24(a1)
+	sw a7, 28(a1)
+	addi a1, a1, 32  // Eight output words per input word
+	bltu a1, t0, 1b
+2:
+	ret
+
+#else
+#error "Unknown architecture"
+#endif
+
 .align 2
 tmds_2bpp_table:
 	.word 0x7f103 // 00, 00
@ -404,17 +574,20 @@ tmds_2bpp_table:
 // much better, and many monitors will still accept the signals as long as you
 // DC couple your DVI signals.

-.macro tmds_fullres_encode_loop_body ra rb
+#if defined(__arm__)
+.macro tmds_fullres_encode_loop_body leftshift ra rb
 	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
+.if \leftshift
+	lsls \ra, r3
+.endif
 	str \ra, [r2, #ACCUM0_OFFS]
+	// Loads interleaved to avoid rdata->addr stall on M33
 	ldr \ra, [r2, #PEEK2_OFFS]
-	ldr \ra, [\ra]
-#if !TMDS_FULLRES_NO_DC_BALANCE
-	str \ra, [r2, #ACCUM1_ADD_OFFS]
-#endif
 	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
+	ldr \ra, [\ra]
 	ldr \rb, [\rb]
 #if !TMDS_FULLRES_NO_DC_BALANCE
+	str \ra, [r2, #ACCUM1_ADD_OFFS]
 	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
 .endm
@ -422,8 +595,9 @@ tmds_2bpp_table:
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Pixel count
+// r3: Left shift amount

-.macro tmds_fullres_encode_loop_16bpp
+.macro tmds_fullres_encode_loop_16bpp leftshift
 	push {r4-r7, lr}
 	mov r4, r8
 	push {r4}
@ -451,8 +625,8 @@ tmds_2bpp_table:
 1:
 .rept 16
 	ldmia r0!, {r4, r6}
-	tmds_fullres_encode_loop_body r4 r5
-	tmds_fullres_encode_loop_body r6 r7
+	tmds_fullres_encode_loop_body \leftshift r4 r5
+	tmds_fullres_encode_loop_body \leftshift r6 r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
@ -465,82 +639,77 @@ tmds_2bpp_table:
 	pop {r4-r7, pc}
 .endm

-// One copy each in X and Y, so the two cores don't step on each other
-decl_func_x tmds_fullres_encode_loop_16bpp_x
-	tmds_fullres_encode_loop_16bpp
-decl_func_y tmds_fullres_encode_loop_16bpp_y
-	tmds_fullres_encode_loop_16bpp
+#elif defined(__riscv)

-
-.macro tmds_fullres_encode_loop_body_leftshift ra rb
-	// Note we apply the leftshift for INTERP0 only
-	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
-	lsls \ra, r3
-	str \ra, [r2, #ACCUM0_OFFS]
-	ldr \ra, [r2, #PEEK2_OFFS]
-	ldr \ra, [\ra]
+.macro tmds_fullres_encode_loop_body leftshift ra rb
+	sw \ra, ACCUM0_OFFS + INTERP1(a2)
+.if \leftshift
+	sll \ra, \ra, a3
+.endif
+	sw \ra, ACCUM0_OFFS(a2)
+	lw \ra, PEEK2_OFFS(a2)
+	lw \rb, PEEK2_OFFS + INTERP1(a2)
+	lw \ra, (\ra)
+	lw \rb, (\rb)
 #if !TMDS_FULLRES_NO_DC_BALANCE
-	str \ra, [r2, #ACCUM1_ADD_OFFS]
-#endif
-	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
-	ldr \rb, [\rb]
-#if !TMDS_FULLRES_NO_DC_BALANCE
-	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+	sw \ra, ACCUM1_ADD_OFFS(a2)
+	sw \rb, ACCUM1_ADD_OFFS + INTERP1(a2)
 #endif
 .endm

-// r0: Input buffer (word-aligned)
-// r1: Output buffer (word-aligned)
-// r2: Pixel count
-// r3: Left shift amount
+// a0: Input buffer (word-aligned)
+// a1: Output buffer (word-aligned)
+// a2: Pixel count
+// a3: Left shift amount

-.macro tmds_fullres_encode_loop_16bpp_leftshift
-	push {r4-r7, lr}
-	mov r4, r8
-	mov r5, r9
-	push {r4-r5}
-
-	lsls r2, #2
-	add r2, r1
-	mov ip, r2
-	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+.macro tmds_fullres_encode_loop_16bpp leftshift
+	sh2add t0, a2, a1
+	li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
 	// DC balance defined to be 0 at start of scanline:
-	movs r4, #0
-	str r4, [r2, #ACCUM1_OFFS]
+	li a4, 0
+	sw a4, ACCUM1_OFFS(a2)
 #if TMDS_FULLRES_NO_DC_BALANCE
-	// Alternate parity between odd/even symbols if there's no balance feedback
-	mvns r4, r4
+	// Alternate parity between odd/even symbols if no feedback
+	li a4, -1
 #endif
-	str r4, [r2, #ACCUM1_OFFS + INTERP1]
+	sw a4, ACCUM1_OFFS + INTERP1(a2)

-	adr r4, 1f
-	adds r4, #1
-	mov r8, r4
-	b 2f
+	bgeu a1, t0, 2f
 	.align 2
 1:
-.rept 16 // 64 pixels per iteration
-	ldmia r0!, {r4, r6}
-	tmds_fullres_encode_loop_body_leftshift r4 r5
-	tmds_fullres_encode_loop_body_leftshift r6 r7
-	stmia r1!, {r4, r5, r6, r7}
+.set i, 0
+.rept 16
+	lw a4, 8 * i + 0(a0)
+	lw a6, 8 * i + 4(a0)
+	tmds_fullres_encode_loop_body \leftshift a4 a5
+	tmds_fullres_encode_loop_body \leftshift a6 a7
+	sw a4, 16 * i +  0(a1)
+	sw a5, 16 * i +  4(a1)
+	sw a6, 16 * i +  8(a1)
+	sw a7, 16 * i + 12(a1)
+.set i, i + 1
 .endr
+	addi a0, a0,  8 * i
+	addi a1, a1, 16 * i
+	bltu a1, t0, 1b
 2:
-	cmp r1, ip
-	beq 1f
-	bx r8
-1:
-	pop {r4-r5}
-	mov r8, r4
-	mov r9, r5
-	pop {r4-r7, pc}
+	ret
 .endm

-decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
-	tmds_fullres_encode_loop_16bpp_leftshift
-decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
-	tmds_fullres_encode_loop_16bpp_leftshift
+#else
+#error "Unknown architecture"
+#endif

+// One copy each in X and Y, so the two cores don't step on each other
+decl_func_x tmds_fullres_encode_loop_16bpp_x
+	tmds_fullres_encode_loop_16bpp 0
+decl_func_y tmds_fullres_encode_loop_16bpp_y
+	tmds_fullres_encode_loop_16bpp 0
+
+decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
+	tmds_fullres_encode_loop_16bpp 1
+decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
+	tmds_fullres_encode_loop_16bpp 1

 // ----------------------------------------------------------------------------
 // Full-resolution 8bpp paletted encode
@ -550,19 +719,19 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
 // base is set to a reordered list of TMDS symbols based
 // on a user colour palette.

+#ifdef __arm__
 // Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
 // interp base pointer. r7 used as temporary.
 .macro tmds_palette_encode_loop_body rd
 	str \rd, [r2, #ACCUM0_OFFS]
 	str \rd, [r2, #ACCUM0_OFFS + INTERP1]
+	// Loads interleaved to avoid rdata->addr stall on M33
 	ldr \rd, [r2, #PEEK2_OFFS]
-	ldr \rd, [\rd]
-#if !TMDS_FULLRES_NO_DC_BALANCE
-	str \rd, [r2, #ACCUM1_ADD_OFFS]
-#endif
 	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr \rd, [\rd]
 	ldr r7, [r7]
 #if !TMDS_FULLRES_NO_DC_BALANCE
+	str \rd, [r2, #ACCUM1_ADD_OFFS]
 	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
 	lsls r7, #10
@ -617,7 +786,241 @@ decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
 	pop {r4-r7, pc}
 .endm

+#elif defined(__riscv)
+
+// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. a2 contains
+// interp base pointer. a5 used as temporary.
+.macro tmds_palette_encode_loop_body rd
+	// Write the input word to ACCUM0 of both interpolators
+	sw \rd, ACCUM0_OFFS(a2)
+	sw \rd, ACCUM0_OFFS + INTERP1(a2)
+	// Read both PEEK2 results (used as pointers below) before dereferencing
+	// either, so each address load is separated from the load that uses it
+	lw \rd, PEEK2_OFFS(a2)
+	lw a5, PEEK2_OFFS + INTERP1(a2)
+	lw \rd, (\rd)
+	lw a5, (a5)
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	// Write the loaded words back to ACCUM1_ADD of the respective interpolator
+	sw \rd, ACCUM1_ADD_OFFS(a2)
+	sw a5, ACCUM1_ADD_OFFS + INTERP1(a2)
+#endif
+	// Combine: the INTERP1 word is shifted up by 10 bits and ORed on top of
+	// the INTERP0 word
+	slli a5, a5, 10
+	or \rd, \rd, a5
+.endm
+
+// Function body for tmds_palette_encode_loop_x/_y.
+// a0: input pixel buffer, a1: output symbol buffer, a2: pixel count
+// (argument order as in the C prototypes in tmds_encode.h).
+// s0 is used as scratch: it is parked in t1 on entry and restored on exit.
+// Also uses a3-a5 and t0.
+.macro tmds_palette_encode_loop
+	mv t1, s0
+	// t0 = output end pointer = a1 + 2 * pixel count
+	sh1add t0, a2, a1
+	// From here on a2 is the interpolator base pointer, not the pixel count
+	li a2, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
+	// DC balance defined to be 0 at start of scanline:
+	li a4, 0
+	sw a4, ACCUM1_OFFS(a2)
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if there's no balance feedback
+	li a4, -1
+#endif
+	sw a4, ACCUM1_OFFS + INTERP1(a2)
+
+	// Nothing to do if the output range is empty
+	bgeu a1, t0, 2f
+	.align 2
+1:
+	// Unrolled x10: each step reads two input words (8 bytes) and writes
+	// four output words (16 bytes)
+.set i, 0
+.rept 10
+	lw a3, 8 * i + 0(a0)
+	lw s0, 8 * i + 4(a0)
+	// Split each input word into two operands for the loop body: the word
+	// shifted left by 2, and the word shifted right by 14
+	srli a4, a3, 14
+	slli a3, a3, 2
+	tmds_palette_encode_loop_body a3
+	tmds_palette_encode_loop_body a4
+	sw a3, 16 * i + 0(a1)
+	sw a4, 16 * i + 4(a1)
+	srli a4, s0, 14
+	slli s0, s0, 2
+	tmds_palette_encode_loop_body s0
+	tmds_palette_encode_loop_body a4
+	sw s0, 16 * i + 8(a1)
+	sw a4, 16 * i + 12(a1)
+.set i, i + 1
+.endr
+	// i is now 10: advance input by 80 bytes and output by 160 bytes
+	addi a0, a0, 8 * i
+	addi a1, a1, 16 * i
+	bltu a1, t0, 1b
+2:
+	mv s0, t1
+	ret
+.endm
+
+
+#endif
+
 decl_func_x tmds_palette_encode_loop_x
 	tmds_palette_encode_loop
 decl_func_y tmds_palette_encode_loop_y
 	tmds_palette_encode_loop
+
+// ----------------------------------------------------------------------------
+// Hand-cranking loops for SIO TMDS encoders
+
+#if DVI_USE_SIO_TMDS_ENCODER
+
+#if defined(__arm__)
+
+// r0: input buffer (word-aligned)
+// r1: output buffer (word-aligned)
+// r2: pixel count
+
+// size_ratio: number of output words stored per input word loaded.
+// peek: if nonzero, even-numbered output words are read with the PEEK
+// register offset adjustment applied, and odd-numbered ones from POP; if
+// zero, every output word is read from POP.
+.macro tmds_encode_sio_loop size_ratio peek
+
+// For larger load/store offsets at high ratios/unroll:
+.cpu cortex-m33
+
+// unroll = input words per loop pass. Targets 4 * TMDS_ENCODE_UNROLL output
+// words per pass, but never less than one input word.
+.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
+.set unroll, 1
+.else
+.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
+.endif
+
+.if \peek
+.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
+.else
+.set even_offset_adj, 0
+.endif
+
+	push {r4, lr}
+	// r2 = output end pointer: r1 + count * 4 bytes (one symbol per word) or
+	// r1 + count * 2 bytes (two symbols per word)
+#if DVI_SYMBOLS_PER_WORD == 1
+	lsls r2, r2, #2
+#else
+	lsls r2, r2, #1
+#endif
+	adds r2, r1
+	ldr r3, =SIO_BASE + SIO_TMDS_CTRL_OFFSET
+	// Enter at the loop test so an empty range does no work
+	b 2f
+1:
+.set i, 0
+.rept unroll
+	// Push one input word into the encoder
+	ldr r4, [r0, #i * 4]
+	str r4, [r3, #SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET]
+.set j, 0
+.rept \size_ratio
+	// offset_adj is even_offset_adj when the output word index
+	// (j + size_ratio * i) is even, and 0 when it is odd
+.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
+#if DVI_SYMBOLS_PER_WORD == 2
+	ldr r4, [r3, #offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET]
+#else
+	ldr r4, [r3, #offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET]
+#endif
+	str r4, [r1, #4 * (j + i * \size_ratio)]
+.set j, j + 1
+.endr
+.set i, i + 1
+.endr
+	adds r0, 4 * unroll
+	adds r1, 4 * unroll * \size_ratio
+2:
+	cmp r1, r2
+	blo 1b
+	pop {r4, pc}
+
+// Switch the assembler back to the cortex-m0plus target for following code
+.cpu cortex-m0plus
+.endm
+
+#elif defined(__riscv)
+
+// a0: input buffer (word-aligned)
+// a1: output buffer (word-aligned)
+// a2: pixel count
+
+// size_ratio: number of output words stored per input word loaded.
+// peek: if nonzero, even-numbered output words are read with the PEEK
+// register offset adjustment applied, and odd-numbered ones from POP; if
+// zero, every output word is read from POP.
+// Uses a3 (encoder register base) and a4 (data) as scratch.
+.macro tmds_encode_sio_loop size_ratio peek
+
+// unroll = input words per loop pass. Targets 4 * TMDS_ENCODE_UNROLL output
+// words per pass, but never less than one input word.
+.if \size_ratio > 4 * TMDS_ENCODE_UNROLL
+.set unroll, 1
+.else
+.set unroll, 4 * TMDS_ENCODE_UNROLL / \size_ratio
+.endif
+
+.if \peek
+.set even_offset_adj, (SIO_TMDS_PEEK_SINGLE_OFFSET - SIO_TMDS_POP_SINGLE_OFFSET)
+.else
+.set even_offset_adj, 0
+.endif
+
+	// a2 = output end pointer: a1 + count * 4 bytes (one symbol per word) or
+	// a1 + count * 2 bytes (two symbols per word)
+#if DVI_SYMBOLS_PER_WORD == 1
+	sh2add a2, a2, a1
+#else
+	sh1add a2, a2, a1
+#endif
+	li a3, SIO_BASE + SIO_TMDS_CTRL_OFFSET
+	// Skip the loop entirely if the output range is empty
+	bgeu a1, a2, 2f
+1:
+.set i, 0
+.rept unroll
+	// Push one input word into the encoder
+	lw a4, i * 4(a0)
+	sw a4, SIO_TMDS_WDATA_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
+.set j, 0
+.rept \size_ratio
+	// offset_adj is even_offset_adj when the output word index
+	// (j + size_ratio * i) is even, and 0 when it is odd
+.set offset_adj, even_offset_adj * ((1 + j + \size_ratio * i) & 0x1)
+#if DVI_SYMBOLS_PER_WORD == 2
+	lw a4, offset_adj + SIO_TMDS_POP_DOUBLE_L0_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
+#else
+	lw a4, offset_adj + SIO_TMDS_POP_SINGLE_OFFSET - SIO_TMDS_CTRL_OFFSET(a3)
+#endif
+	sw a4, 4 * (j + i * \size_ratio)(a1)
+.set j, j + 1
+.endr
+.set i, i + 1
+.endr
+	addi a0, a0, 4 * unroll
+	addi a1, a1, 4 * unroll * \size_ratio
+	bltu a1, a2, 1b
+2:
+	ret
+.endm
+
+#else
+#error "Unknown architecture"
+#endif
+
+// For DVI_SYMBOLS_PER_WORD == 2, the ratio of output : input buffer size is:
+//
+// Bits/pixel | Ratio (with hdouble) | Ratio (no hdouble)
+// -----------+----------------------+-------------------
+//          1 |                   32 |                 16
+//          2 |                   16 |                  8
+//          4 |                    8 |                  4
+//          8 |                    4 |                  2
+//         16 |                    2 |                  1
+//
+// For DVI_SYMBOLS_PER_WORD == 1, these ratios are doubled.
+
+// poppop variants will read from a xxx_POP register for every output word
+decl_func tmds_encode_sio_loop_poppop_ratio1
+	tmds_encode_sio_loop 1, 0
+decl_func tmds_encode_sio_loop_poppop_ratio2
+	tmds_encode_sio_loop 2, 0
+decl_func tmds_encode_sio_loop_poppop_ratio4
+	tmds_encode_sio_loop 4, 0
+decl_func tmds_encode_sio_loop_poppop_ratio8
+	tmds_encode_sio_loop 8, 0
+decl_func tmds_encode_sio_loop_poppop_ratio16
+	tmds_encode_sio_loop 16, 0
+decl_func tmds_encode_sio_loop_poppop_ratio32
+	tmds_encode_sio_loop 32, 0
+decl_func tmds_encode_sio_loop_poppop_ratio64
+	tmds_encode_sio_loop 64, 0
+
+// peekpop variants will read alternately from xxx_PEEK and xxx_POP: this is
+// needed for pixel-doubled output when DVI_SYMBOLS_PER_WORD == 1 (note the
+// POP value is different from the PEEK value, as it's the same data but with
+// different running DC balance)
+decl_func tmds_encode_sio_loop_peekpop_ratio1
+	tmds_encode_sio_loop 1, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio2
+	tmds_encode_sio_loop 2, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio4
+	tmds_encode_sio_loop 4, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio8
+	tmds_encode_sio_loop 8, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio16
+	tmds_encode_sio_loop 16, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio32
+	tmds_encode_sio_loop 32, 1
+decl_func tmds_encode_sio_loop_peekpop_ratio64
+	tmds_encode_sio_loop 64, 1
+
+#endif
--- a/software/libdvi/tmds_encode.c
+++ b/software/libdvi/tmds_encode.c
@ -3,7 +3,7 @@
 #include "hardware/gpio.h"
 #include "hardware/sync.h"

-static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
+static const __unused uint32_t __scratch_x("tmds_table") tmds_table[] = {
 #include "tmds_table.h"
 };

@ -11,14 +11,15 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
 // memory. There is a third copy which can go in flash, because it's just used
 // to generate palette LUTs. The ones we don't use will get garbage collected
 // during linking.
-const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
+const __unused uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
 #include "tmds_table_fullres.h"
 };

-const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
+const __unused uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
 #include "tmds_table_fullres.h"
 };

+#if !DVI_USE_SIO_TMDS_ENCODER
 // Configure an interpolator to extract a single colour channel from each of a pair
 // of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
 // pixel_width wide. Produce a LUT address for the first pixel's colour data on
@ -35,11 +36,16 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp

 	int shift_channel_to_index = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
 	int oops = 0;
+#if PICO_RP2040
 	if (shift_channel_to_index < 0) {
 		// "It's ok we'll fix it in software"
 		oops = -shift_channel_to_index;
 		shift_channel_to_index = 0;
 	}
+#else
+	// Now a right-rotate, not a right-shift
+	shift_channel_to_index &= 0x1f;
+#endif

 	uint index_msb = index_shift + lut_index_width - 1;

@ -60,23 +66,60 @@ static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp
 	return oops;
 }

+#else
+// Encoding a single channel at a time is not the most efficient way to use
+// this hardware, because it means we read the colour buffer multiple times,
+// but it fits better with how things are done in software on RP2040.
+static void __not_in_flash_func(configure_sio_tmds_for_single_channel)(uint channel_msb, uint channel_lsb, uint pixel_width, bool hdouble) {
+	// Lane 0 width is programmed as (bit count - 1): 0..7 selects 1..8 bits
+	const uint l0_nbits = channel_msb - channel_lsb;
+	assert(l0_nbits <= 7);
+	// Lane 0 rotate amount, derived from the channel MSB position and
+	// wrapped to 4 bits
+	const uint l0_rot = (channel_msb - 7u) & 0xfu;
+	// Pixel shift field: 1 + log2(pixel_width) for power-of-two widths
+	const uint pix_shift = 1 + __builtin_ctz(pixel_width);
+	// Horizontal doubling is requested through the PIX2_NOSHIFT bit
+	const uint pix2_noshift = (uint)hdouble;
+
+	sio_hw->tmds_ctrl =
+		SIO_TMDS_CTRL_CLEAR_BALANCE_BITS |
+		(l0_nbits << SIO_TMDS_CTRL_L0_NBITS_LSB) |
+		(l0_rot << SIO_TMDS_CTRL_L0_ROT_LSB) |
+		(pix_shift << SIO_TMDS_CTRL_PIX_SHIFT_LSB) |
+		(pix2_noshift << SIO_TMDS_CTRL_PIX2_NOSHIFT_LSB);
+}
+#endif
+
 // Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer
 // of TMDS symbols from this colour channel. Number of pixels must be even,
 // pixel buffer must be word-aligned.

 void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+#if DVI_USE_SIO_TMDS_ENCODER
+	configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, true);
+#if DVI_SYMBOLS_PER_WORD == 1
+	tmds_encode_sio_loop_peekpop_ratio4(pixbuf, symbuf, 2 * n_pix);
+#else
+	tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, 2 * n_pix);
+#endif
+#else
 	interp_hw_save_t interp0_save;
 	interp_save(interp0_hw, &interp0_save);
 	int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
+#if PICO_RP2040
 	if (require_lshift)
 		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
 	else
 		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
+#else
+	assert(!require_lshift); (void)require_lshift;
+	tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
+#endif
 	interp_restore(interp0_hw, &interp0_save);
+#endif
 }

 // As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned.
 void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+#if DVI_USE_SIO_TMDS_ENCODER
+	configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 8, true);
+#if DVI_SYMBOLS_PER_WORD == 1
+	tmds_encode_sio_loop_peekpop_ratio8(pixbuf, symbuf, 2 * n_pix);
+#else
+	tmds_encode_sio_loop_poppop_ratio4(pixbuf, symbuf, 2 * n_pix);
+#endif
+#else
 	interp_hw_save_t interp0_save, interp1_save;
 	interp_save(interp0_hw, &interp0_save);
 	interp_save(interp1_hw, &interp1_save);
@ -86,12 +129,18 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
 	int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
 	int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
 	assert(!lshift_upper); (void)lshift_upper;
+#if PICO_RP2040
 	if (require_lshift)	
 		tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift);
 	else
 		tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
+#else
+	assert(!require_lshift); (void)require_lshift;
+	tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
+#endif
 	interp_restore(interp0_hw, &interp0_save);
 	interp_restore(interp1_hw, &interp1_save);
+#endif
 }

 // ----------------------------------------------------------------------------
@ -103,16 +152,22 @@ void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf,
 // pixels, and INTERP1 for odd pixels. Note this means that even and odd
 // symbols have their DC balance handled separately, which is not to spec.

+#if !DVI_USE_SIO_TMDS_ENCODER
 static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
 	const uint index_shift = 2; // scaled lookup for 4-byte LUT entries

 	int shift_channel_to_index = channel_msb - (lut_index_width - 1) - index_shift;
 	int oops = 0;
+#if PICO_RP2040
 	if (shift_channel_to_index < 0) {
 		// "It's ok we'll fix it in software"
 		oops = -shift_channel_to_index;
 		shift_channel_to_index = 0;
 	}
+#else
+	// Now a right-rotate rather than right-shift
+	shift_channel_to_index &= 0x1f;
+#endif

 	uint index_msb = index_shift + lut_index_width - 1;

@ -133,8 +188,17 @@ static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t

 	return oops;
 }
+#endif

 void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+#if DVI_USE_SIO_TMDS_ENCODER
+	configure_sio_tmds_for_single_channel(channel_msb, channel_lsb, 16, false);
+#if DVI_SYMBOLS_PER_WORD == 1
+	tmds_encode_sio_loop_poppop_ratio2(pixbuf, symbuf, n_pix);
+#else
+	tmds_encode_sio_loop_poppop_ratio1(pixbuf, symbuf, n_pix);
+#endif
+#else
 	uint core = get_core_num();
 #if !TMDS_FULLRES_NO_INTERP_SAVE
 	interp_hw_save_t interp0_save, interp1_save;
@ -165,17 +229,16 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
 	interp_restore(interp0_hw, &interp0_save);
 	interp_restore(interp1_hw, &interp1_save);
 #endif
+#endif
 }

 static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };

-static inline int byte_imbalance(uint32_t x)
-{
+static inline int byte_imbalance(uint32_t x) {
 	return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF];
 }

-static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
-{
+static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym) {
 	int pixel_imbalance = byte_imbalance(pixel);
 	uint32_t sym = pixel & 1;
 	if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
--- a/software/libdvi/tmds_encode.h
+++ b/software/libdvi/tmds_encode.h
@ -34,4 +34,23 @@ void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t
 void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);

+#if !PICO_RP2040
+// Crank the SIO TMDS encoder:
+void tmds_encode_sio_loop_poppop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_poppop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+
+void tmds_encode_sio_loop_peekpop_ratio1(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio2(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio4(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio8(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio16(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio32(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_sio_loop_peekpop_ratio64(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+#endif
+
 #endif
--- a/software/libsprite/CMakeLists.txt
+++ b/software/libsprite/CMakeLists.txt
@ -3,13 +3,24 @@ add_library(libsprite INTERFACE)
 target_sources(libsprite INTERFACE
 	${CMAKE_CURRENT_LIST_DIR}/affine_transform.h
 	${CMAKE_CURRENT_LIST_DIR}/sprite_asm_const.h
-	${CMAKE_CURRENT_LIST_DIR}/sprite.S
 	${CMAKE_CURRENT_LIST_DIR}/sprite.c
 	${CMAKE_CURRENT_LIST_DIR}/sprite.h
-	${CMAKE_CURRENT_LIST_DIR}/tile.S
 	${CMAKE_CURRENT_LIST_DIR}/tile.c
 	${CMAKE_CURRENT_LIST_DIR}/tile.h
 	)

+if (PICO_RISCV)
+	target_sources(libsprite INTERFACE
+		${CMAKE_CURRENT_LIST_DIR}/sprite_riscv.S
+		${CMAKE_CURRENT_LIST_DIR}/tile_riscv.S
+		)
+else ()
+	target_sources(libsprite INTERFACE
+		${CMAKE_CURRENT_LIST_DIR}/sprite_armv6m.S
+		${CMAKE_CURRENT_LIST_DIR}/tile_armv6m.S
+		)
+endif()
+
+
 target_include_directories(libsprite INTERFACE ${CMAKE_CURRENT_LIST_DIR})
 target_link_libraries(libsprite INTERFACE pico_base_headers hardware_interp)
--- a/software/libsprite/affine_transform.h
+++ b/software/libsprite/affine_transform.h
@ -4,7 +4,7 @@
 // Stolen from RISCBoy

 #include <stdint.h>
-#include "pico/platform.h"
+#include "pico.h"

 // Store unpacked affine transforms as signed 16.16 fixed point in the following order:
 // a00, a01, b0,   a10, a11, b1
--- a/software/libsprite/sprite.c
+++ b/software/libsprite/sprite.c
@ -1,7 +1,7 @@
 #include "sprite.h"
 #include "affine_transform.h"

-#include "pico/platform.h" // for __not_in_flash
+#include "pico.h" // for __not_in_flash
 #include "hardware/interp.h"

 // Note some of the sprite routines are quite large (unrolled), so trying to
--- a/software/libsprite/sprite_armv6m.S
+++ b/software/libsprite/sprite_armv6m.S
--- a/software/libsprite/sprite_asm_const.h
+++ b/software/libsprite/sprite_asm_const.h
@ -5,8 +5,11 @@
 .macro decl_func name
 .section .time_critical.\name, "ax"
 .global \name
+.p2align 2
+#ifndef __riscv
 .type \name,%function
 .thumb_func
+#endif
 \name:
 .endm

@ -16,11 +19,40 @@
 // same way as non-alpha pixels when encoding (and the co-opted channel LSB
 // always ends up being set on alpha pixels, which is pretty harmless)

+// Also note this is expressed as a right-shift into the carry flag (on Arm),
+// so this is equal to the bit index of the alpha bit plus 1. On RISC-V it's
+// idiomatic to shift up to the sign bit instead, so a left shift of 32 - x
+// should be used instead of a right shift of x.
+
 #define ALPHA_SHIFT_16BPP 6

 // Assume RAGB2132 (so alpha is bit 5)

 #define ALPHA_SHIFT_8BPP 6

+#ifdef __riscv
+// Macros for forcing individual instructions to be 32 bits, to maintain
+// branch target alignment without adding NOPs
+.macro norvc_1a instr, arg0
+.option push
+.option norvc
+\instr \arg0
+.option pop
+.endm
+
+.macro norvc_2a instr, arg0, arg1
+.option push
+.option norvc
+\instr \arg0, \arg1
+.option pop
+.endm
+
+.macro norvc_3a instr, arg0, arg1, arg2
+.option push
+.option norvc
+\instr \arg0, \arg1, \arg2
+.option pop
+.endm
+#endif

 #endif
--- a/software/libsprite/sprite_riscv.S
+++ b/software/libsprite/sprite_riscv.S
@ -0,0 +1,657 @@
+// Functions for doing simple 2D graphics operations on a RGB scanline buffer.
+
+#include "hardware/regs/addressmap.h"
+#include "hardware/regs/sio.h"
+
+#include "sprite_asm_const.h"
+
+#define POP2_OFFS   (SIO_INTERP0_POP_FULL_OFFSET   - SIO_INTERP0_ACCUM0_OFFSET)
+#define PEEK0_OFFS  (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define PEEK1_OFFS  (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+#define CTRL0_OFFS  (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define INTERP1     (SIO_INTERP1_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+
+#if defined(__riscv_c) || defined(__riscv_zca)
+#define RISCV_HAVE_COMPRESSED_ISA 1
+#endif
+
+// ----------------------------------------------------------------------------
+// Colour fill
+
+// a0: dst
+// a1: value
+// a2: count
+
+decl_func sprite_fill8
+	// Slide for short fills (up to 18 bytes). The li is forced to 32 bits so
+	// that the jr below ends on a 4-byte boundary: the .align then inserts no
+	// padding, and the first sb sits exactly 12 bytes after the auipc, which
+	// is what the jump offset assumes. (A 16-bit li would leave the jr ending
+	// 2 bytes short of alignment, and the padding would shift the slide away
+	// from the computed target.)
+	norvc_2a li a3, 18
+	bltu a3, a2, 2f
+#ifndef RISCV_HAVE_COMPRESSED_ISA
+#error "This address computation is wrong for non-RVC:"
+#endif
+	// Target = (first sb) + 4 * (18 - count), so exactly count stores execute
+	auipc a3, 0        // 32-bit instruction after address of auipc
+	slli a2, a2, 2     // 16-bit instruction after address of auipc
+	sub a3, a3, a2     // 16-bit instruction after address of auipc
+	jr a3, 18 * 4 + 12 // 32-bit instruction after address of auipc
+.align 2
+	// With Zcb this is a mix of 16-bit and 32-bit instructions due to the
+	// limited immediate size. Force 32-bit so we can do a computed branch.
+.option push
+.option norvc
+	sb a1, 17(a0)
+	sb a1, 16(a0)
+	sb a1, 15(a0)
+	sb a1, 14(a0)
+	sb a1, 13(a0)
+	sb a1, 12(a0)
+	sb a1, 11(a0)
+	sb a1, 10(a0)
+	sb a1,  9(a0)
+	sb a1,  8(a0)
+	sb a1,  7(a0)
+	sb a1,  6(a0)
+	sb a1,  5(a0)
+	sb a1,  4(a0)
+	sb a1,  3(a0)
+	sb a1,  2(a0)
+	sb a1,  1(a0)
+	sb a1,  0(a0)
+.option pop
+	ret
+2:
+	// Duplicate byte x4
+	packh a1, a1, a1
+	pack a1, a1, a1
+	// Get a0 word-aligned: store one byte if the address is odd...
+	andi a3, a0, 0x1
+	beqz a3, 1f
+	sb a1, (a0)
+	addi a0, a0, 1
+	addi a2, a2, -1
+1:
+	// ...then one halfword if bit 1 of the (now even) address is set
+	andi a3, a0, 0x2
+	beqz a3, 1f
+	sh a1, (a0)
+	addi a0, a0, 2
+	addi a2, a2, -2
+1:
+	// Set up for main loop. Limit pointer at end - (loop body size)
+	add a2, a2, a0
+	addi a2, a2, -16
+
+	// Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
+1:
+	sw a1,  0(a0)
+	sw a1,  4(a0)
+	sw a1,  8(a0)
+	sw a1, 12(a0)
+	addi a0, a0, 16
+	bgeu a2, a0, 1b
+
+	// Main loop done, now tidy up the odds and ends. Note bits 3:0 of the
+	// pointer difference are not affected by us subtracting 16 earlier.
+	sub a2, a2, a0
+	// No more than 15 bytes remaining -- first test bit 3 by shifting it to sign bit
+	slli a2, a2, 28
+	bgez a2, 1f
+	sw a1, 0(a0)
+	sw a1, 4(a0)
+	addi a0, a0, 8
+1:
+	// Bit 2: four more bytes
+	slli a2, a2, 1
+	bgez a2, 1f
+	sw a1, (a0)
+	addi a0, a0, 4
+1:
+	// Bit 1: two more bytes
+	slli a2, a2, 1
+	bgez a2, 1f
+	sh a1, (a0)
+	addi a0, a0, 2
+1:
+	// Bit 0: one last byte
+	slli a2, a2, 1
+	bgez a2, 1f
+	sb a1, (a0)
+1:
+	ret
+
+.p2align 2
+decl_func sprite_fill16
+	// Slide for short fills (up to 16 pixels). The li is forced to 32 bits
+	// so that the first sh sits exactly 12 bytes after the auipc.
+	norvc_2a li a3, 16
+	bltu a3, a2, 2f
+#ifndef RISCV_HAVE_COMPRESSED_ISA
+#error "This address computation is wrong for non-RVC:"
+#endif
+	// Target = (first sh) + 4 * (16 - count), so exactly count stores execute
+	auipc a3, 0        // 32-bit instruction after address of auipc
+	slli a2, a2, 2     // 16-bit instruction after address of auipc
+	sub a3, a3, a2     // 16-bit instruction after address of auipc
+	jr a3, 16 * 4 + 12 // 32-bit instruction after address of auipc
+.option push
+.option norvc
+	sh a1, 30(a0)
+	sh a1, 28(a0)
+	sh a1, 26(a0)
+	sh a1, 24(a0)
+	sh a1, 22(a0)
+	sh a1, 20(a0)
+	sh a1, 18(a0)
+	sh a1, 16(a0)
+	sh a1, 14(a0)
+	sh a1, 12(a0)
+	sh a1, 10(a0)
+	sh a1,  8(a0)
+	sh a1,  6(a0)
+	sh a1,  4(a0)
+	sh a1,  2(a0)
+	sh a1,  0(a0)
+.option pop
+	ret
+2:
+	// Get word-aligned before main fill loop: if bit 1 of the destination
+	// pointer is set, store a single pixel first. The test must be on the
+	// pointer (a0), not the pixel count (a2).
+	andi a3, a0, 0x2
+	beqz a3, 1f
+	sh a1, (a0)
+	addi a0, a0, 2
+	addi a2, a2, -1
+1:
+	// Set limit pointer at end - (loop body size)
+	slli a2, a2, 1
+	add a2, a2, a0
+	addi a2, a2, -32
+	// Duplicate the pixel into both halves of the word
+	pack a1, a1, a1
+	// We can fall through because cases < 1 loop are handled by slide
+1:
+	sw a1,  0(a0)
+	sw a1,  4(a0)
+	sw a1,  8(a0)
+	sw a1, 12(a0)
+	sw a1, 16(a0)
+	sw a1, 20(a0)
+	sw a1, 24(a0)
+	sw a1, 28(a0)
+	addi a0, a0, 32
+	bgeu a2, a0, 1b
+
+	// Most of the work done, we have a few more to tidy up -- note bits 4:1
+	// of the pointer difference are not affected by earlier subtraction of 32
+	sub a2, a2, a0
+
+	// Bit 4 becomes sign bit
+	slli a2, a2, 27
+	bgez a2, 1f
+	sw a1,  0(a0)
+	sw a1,  4(a0)
+	sw a1,  8(a0)
+	sw a1, 12(a0)
+	addi a0, a0, 16
+1:
+	// Bit 3: eight more bytes
+	slli a2, a2, 1
+	bgez a2, 1f
+	sw a1,  0(a0)
+	sw a1,  4(a0)
+	addi a0, a0, 8
+1:
+	// Bit 2: four more bytes
+	slli a2, a2, 1
+	bgez a2, 1f
+	sw a1,  0(a0)
+	addi a0, a0, 4
+1:
+	// Bit 1: one last pixel
+	slli a2, a2, 1
+	bgez a2, 1f
+	sh a1,  0(a0)
+1:
+	ret
+
+
+// ----------------------------------------------------------------------------
+// Non-AT sprite
+
+
+// TODO 8-bit version not yet ported to RISC-V
+#if 0
+// Unrolled loop body with an initial computed branch.
+
+// a0: dst
+// a1: src
+// a2: pixel count
+decl_func sprite_blit8
+	mov ip, a0
+	lsrs a3, a2, #3
+	lsls a3, #3
+	eors a2, a3   // a2 = pixels % 8, a3 = pixels - pixels % 8
+
+	add a0, a3
+	add a1, a3
+
+	adr a3, 2f
+	lsls a2, #2
+	subs a3, a2
+	adds a3, #1 // thumb bit >:(
+	bx a3
+
+.align 2
+1:
+	subs a0, #8
+	subs a1, #8
+	ldrb a3, [a1, #7]
+	strb a3, [a0, #7]
+	ldrb a3, [a1, #6]
+	strb a3, [a0, #6]
+	ldrb a3, [a1, #5]
+	strb a3, [a0, #5]
+	ldrb a3, [a1, #4]
+	strb a3, [a0, #4]
+	ldrb a3, [a1, #3]
+	strb a3, [a0, #3]
+	ldrb a3, [a1, #2]
+	strb a3, [a0, #2]
+	ldrb a3, [a1, #1]
+	strb a3, [a0, #1]
+	ldrb a3, [a1, #0]
+	strb a3, [a0, #0]
+2:
+	cmp a0, ip
+	bhi 1b
+	bx lr
+
+.macro sprite_blit8_alpha_body n
+	ldrb a3, [a1, #\n]
+	lsrs a2, a3, #ALPHA_SHIFT_8BPP
+	bcc 2f
+	strb a3, [a0, #\n]
+2:
+.endm
+
+// a0: dst
+// a1: src
+// a2: pixel count
+decl_func sprite_blit8_alpha
+	mov ip, a0
+	lsrs a3, a2, #3
+	lsls a3, #3
+	eors a2, a3
+
+	add a0, a3
+	add a1, a3
+
+	adr a3, 3f
+	lsls a2, #3
+	subs a3, a2
+	adds a3, #1
+	bx a3
+
+.align 2
+1:
+	subs a0, #8
+	subs a1, #8
+	sprite_blit8_alpha_body 7
+	sprite_blit8_alpha_body 6
+	sprite_blit8_alpha_body 5
+	sprite_blit8_alpha_body 4
+	sprite_blit8_alpha_body 3
+	sprite_blit8_alpha_body 2
+	sprite_blit8_alpha_body 1
+	sprite_blit8_alpha_body 0
+3:
+	cmp a0, ip
+	bhi 1b
+	bx lr
+
+#endif
+
+// Note this is the same ideal cycle count as lhu; lhu; sh; sh; but it reduces
+// the number of memory accesses by 25%, so less bus contention
+.macro storew_alignh rd ra offs
+	sh \rd, \offs(\ra)
+	srli \rd, \rd, 16
+	sh \rd, \offs+2(\ra)
+.endm
+
+// a0: dst
+// a1: src
+// a2: pixel count
+// Copies a2 16-bit pixels from a1 to a0. Source is read a word at a time
+// once aligned; destination is always written with halfword stores.
+// Uses a3 and a5 as scratch.
+decl_func sprite_blit16
+	// Force source pointer to be word-aligned
+	andi a3, a1, 2
+	beqz a3, 1f
+	lhu a3, (a1)
+	sh a3, (a0)
+	addi a0, a0, 2
+	addi a1, a1, 2
+	addi a2, a2, -1
+1:
+	// Each loop is 8 pixels. Place limit pointer at 16 bytes before
+	// end, loop until past it. There will be 0 to 7 pixels remaining.
+	slli a2, a2, 1
+	add a2, a2, a0
+	addi a5, a2, -16
+	// Early out:
+	bltu a5, a0, 2f
+1:
+	// a2 is free for reuse as a data register from here (limit is in a5)
+	lw a2, 0(a1)
+	lw a3, 4(a1)
+	storew_alignh a2, a0, 0
+	storew_alignh a3, a0, 4
+	lw a2, 8(a1)
+	lw a3, 12(a1)
+	storew_alignh a2, a0, 8
+	storew_alignh a3, a0, 12
+	addi a0, a0, 16
+	addi a1, a1, 16
+	bgeu a5, a0, 1b
+2:
+	// a5 = limit - dst; its low 4 bits equal the remaining byte count, since
+	// the limit was offset by exactly 16
+	sub a5, a5, a0
+	// At least 4 pixels? (bit 3 -> sign bit)
+	slli a5, a5, 28
+	bgez a5, 1f
+	lw a2, 0(a1)
+	lw a3, 4(a1)
+	storew_alignh a2, a0, 0
+	storew_alignh a3, a0, 4
+	addi a0, a0, 8
+	addi a1, a1, 8
+1:
+	// At least 2 pixels?
+	slli a5, a5, 1
+	bgez a5, 1f
+	lw a2, 0(a1)
+	storew_alignh a2, a0, 0
+	addi a0, a0, 4
+	addi a1, a1, 4
+1:
+	// One more pixel?
+	slli a5, a5, 1
+	bgez a5, 1f
+	lhu a3, (a1)
+	sh a3, (a0)
+1:
+	ret
+
+// dst: a0, src: a1, clobbers: a4-a7
+.macro sprite_blit16_alpha_body_x2 n
+	// Disable RVC to force 32-bit alignment of branch targets without adding
+	// alignment nops (lhu/sh *may* be 16-bit if Zcb is enabled)
+.option push
+.option norvc
+	// Interleave two loads to avoid load->shift dependency stall
+	lhu a4, 4*\n(a1)
+	lhu a5, 4*\n+2(a1)
+	slli a6, a4, 32 - ALPHA_SHIFT_16BPP
+	slli a7, a5, 32 - ALPHA_SHIFT_16BPP
+	bgez a6, 3f
+	sh a4, 4*\n(a0)
+3:
+	bgez a7, 3f
+	sh a5, 4*\n+2(a0)
+3:
+.option pop
+.endm
+
+// a0: dst
+// a1: src
+// a2: pixel count
+// Copies a2 16-bit pixels from a1 to a0, skipping the store for any pixel
+// whose alpha bit (bit ALPHA_SHIFT_16BPP - 1) is clear.
+// Uses a4-a7 as scratch (via sprite_blit16_alpha_body_x2).
+decl_func sprite_blit16_alpha
+	// Not using the computed branch approach of the v6-M code as it doesn't
+	// play nicely with the pairing of pixels used in the loop body here.
+	// a2 = limit pointer, 16 bytes (one loop pass) before end of dst
+	slli a2, a2, 1
+	add a2, a2, a0
+	// norvc_3a forces a 32-bit encoding, to keep branch targets aligned
+	// without padding NOPs
+	norvc_3a addi, a2, a2, -16
+	// Early out if fewer than 8 pixels
+	bltu a2, a0, 2f
+1:
+	// 8 pixels per loop
+	sprite_blit16_alpha_body_x2 0
+	sprite_blit16_alpha_body_x2 1
+	sprite_blit16_alpha_body_x2 2
+	sprite_blit16_alpha_body_x2 3
+	addi a0, a0, 16
+	addi a1, a1, 16
+	bgeu a2, a0, 1b
+2:
+	// a2 = limit - dst; its low 4 bits equal the remaining byte count
+	sub a2, a2, a0
+	// At least 4 pixels? (bit 3 -> sign bit)
+	slli a2, a2, 28
+	bgez a2, 1f
+	sprite_blit16_alpha_body_x2 0
+	sprite_blit16_alpha_body_x2 1
+	addi a0, a0, 8
+	addi a1, a1, 8
+1:
+	// At least 2 pixels?
+	norvc_3a slli, a2, a2, 1
+	bgez a2, 1f
+	sprite_blit16_alpha_body_x2 0
+	addi a1, a1, 4
+	addi a0, a0, 4
+1:
+	// One more pixel?
+	slli a2, a2, 1
+	bgez a2, 1f
+	lhu a4, (a1)
+	// Alpha bit -> sign bit; skip the store if it is clear
+	slli a6, a4, 32 - ALPHA_SHIFT_16BPP
+	bgez a6, 1f
+	sh a4, (a0)
+1:
+	ret
+// ----------------------------------------------------------------------------
+// Affine-transformed sprite (note these are just the inner loops -- INTERP0
+// must be configured by the caller, which is presumably not written in asm)
+
+// TODO not yet ported to RISC-V
+#if 0
+// r0: raster start pointer
+// r1: raster span size (pixels)
+
+.macro sprite_ablit8_loop_body n
+	ldr r1, [r3, #CTRL0_OFFS]
+	ldr r2, [r3, #POP2_OFFS]
+	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
+	bcs 2f
+	ldrb r2, [r2]
+	strb r2, [r0, #\n]
+2:
+.endm
+
+decl_func sprite_ablit8_loop
+	mov ip, r0
+
+	lsrs r2, r1, #3
+	lsls r2, #3
+	eors r1, r2
+	add r0, r2
+
+	adr r2, 3f
+	movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
+	muls r1, r3
+	subs r2, r1
+	adds r2, #1
+
+	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	bx r2
+
+.align 2
+	nop
+1:
+	subs r0, #8
+	sprite_ablit8_loop_body 7
+	sprite_ablit8_loop_body 6
+	sprite_ablit8_loop_body 5
+	sprite_ablit8_loop_body 4
+	sprite_ablit8_loop_body 3
+	sprite_ablit8_loop_body 2
+	sprite_ablit8_loop_body 1
+	sprite_ablit8_loop_body 0
+3:
+	cmp r0, ip
+	bne 1b
+	bx lr
+
+
+
+// As above but bit 5 is assumed to be an alpha bit (RAGB2132)
+
+.macro sprite_ablit8_alpha_loop_body n
+	ldr r1, [r3, #CTRL0_OFFS]
+	ldr r2, [r3, #POP2_OFFS]
+	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
+	bcs 2f
+	ldrb r2, [r2]
+	lsrs r1, r2, #ALPHA_SHIFT_8BPP
+	bcc 2f
+	strb r2, [r0, #\n]
+2:
+.endm
+
+decl_func sprite_ablit8_alpha_loop
+	mov ip, r0
+	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+
+	lsrs r2, r1, #3
+	lsls r2, #3
+	eors r1, r2
+	add r0, r2
+
+	adr r2, 3f
+	lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
+	subs r2, r1
+	adds r2, #1
+	bx r2
+
+.align 2
+	nop
+1:
+	subs r0, #8
+	sprite_ablit8_alpha_loop_body 7
+	sprite_ablit8_alpha_loop_body 6
+	sprite_ablit8_alpha_loop_body 5
+	sprite_ablit8_alpha_loop_body 4
+	sprite_ablit8_alpha_loop_body 3
+	sprite_ablit8_alpha_loop_body 2
+	sprite_ablit8_alpha_loop_body 1
+	sprite_ablit8_alpha_loop_body 0
+3:
+	cmp r0, ip
+	bhi 1b
+	bx lr
+
+
+
+.macro sprite_ablit16_loop_body n
+	ldr r1, [r3, #CTRL0_OFFS]
+	ldr r2, [r3, #POP2_OFFS]
+	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1
+	bcs 2f
+	ldrh r2, [r2]
+	strh r2, [r0, #2*\n]
+2:
+.endm
+
+decl_func sprite_ablit16_loop
+	mov ip, r0
+
+	lsrs r2, r1, #3
+	lsls r2, #3
+	eors r1, r2
+	lsls r2, #1 // Each pixel is 2 bytes
+	add r0, r2
+
+	adr r2, 3f
+	movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
+	muls r1, r3
+	subs r2, r1
+	adds r2, #1
+
+	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	bx r2
+
+.align 2
+	nop
+1:
+	subs r0, #16
+	sprite_ablit16_loop_body 7
+	sprite_ablit16_loop_body 6
+	sprite_ablit16_loop_body 5
+	sprite_ablit16_loop_body 4
+	sprite_ablit16_loop_body 3
+	sprite_ablit16_loop_body 2
+	sprite_ablit16_loop_body 1
+	sprite_ablit16_loop_body 0
+3:
+	cmp r0, ip
+	bne 1b
+	bx lr
+
+#endif
+
+#define FIX_OVERF_CHECK 1
+
+#ifndef RISCV_HAVE_COMPRESSED_ISA
+#error "Address calculations are incorrect if not assembled with C extension"
+#endif
+.macro sprite_ablit16_alpha_loop_body n
+	// Instructions which are only compressible under Zcb (e.g. lhu, sh) are
+	// forced uncompressed, to get consistent size for address calculations.
+	// The computed branch in sprite_ablit16_alpha_loop assumes this body is
+	// exactly 24 bytes when FIX_OVERF_CHECK is 0, and exactly 30 bytes when
+	// FIX_OVERF_CHECK is 1. Keep the two in step when editing.
+
+	// Without FIX_OVERF_CHECK:
+	// Bit 25 is OVERF, bit 24 is OVERF1, bits 31:26 are zero, so can test for
+	// overflow by testing the uppermost byte of CTRL0 for nonzero.
+	// With FIX_OVERF_CHECK: ACCUM0 and ACCUM1 are read directly, and the
+	// pixel is skipped if either has any bit set at or above bit 23.
+	// In both variants POP2 is read before the skip branches, so it is read
+	// once per pixel whether or not the pixel is drawn.
+#if !FIX_OVERF_CHECK
+	norvc_2a lbu a1, CTRL0_OFFS+3(a5)
+	lw a2, POP2_OFFS(a5)
+	bnez a1, 2f
+#else
+	lw a1, ACCUM0_OFFS(a5)
+	lw a3, ACCUM1_OFFS(a5)
+	lw a2, POP2_OFFS(a5)
+	srli a1, a1, 7 + 16
+	bnez a1, 2f
+	srli a3, a3, 7 + 16
+	bnez a3, 2f
+#endif
+	// a2 (the POP2 result) is used as the source pixel address
+	norvc_2a lhu a2, (a2)
+	// TODO dep stall on lhu, but it makes the OVERF case faster:
+	// Alpha bit -> sign bit; skip the store if it is clear
+	slli a1, a2, 32 - ALPHA_SHIFT_16BPP
+	bgez a1, 2f
+	norvc_2a sh a2, 2*\n(a0)
+2:
+.endm
+
+// a0: raster start pointer (16-bit pixels)
+// a1: raster span size (pixels)
+// Uses a2-a5 as scratch. Pixels are written from the end of the span
+// backwards, 8 at a time, after an initial partial block of (span % 8).
+decl_func sprite_ablit16_alpha_loop
+	// a4 = start of span, used as the loop termination bound
+	mv a4, a0
+	li a5, SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET
+
+	// Split off pixels modulo 8
+	andi a2, a1, 0x7
+	sub a1, a1, a2
+	// Pointer to beginning of endmost block of 8 pixels:
+	sh1add a0, a1, a0
+
+	// Compute branch into first loop, which has the modulo-8 pixels.
+	// Each pixel takes 24 bytes of instructions without FIX_OVERF_CHECK
+	// (a2 * 8 * 3), or 30 bytes with it.
+#if !FIX_OVERF_CHECK
+	slli a2, a2, 3
+	sh1add a2, a2, a2
+#else
+	li a3, 30
+	mul a2, a2, a3
+#endif
+
+	// Jump to (label 3) - (bytes per pixel) * (span % 8), i.e. into the tail
+	// of the unrolled block so only the last (span % 8) bodies execute
+	la a1, 3f
+	sub a1, a1, a2
+	jr a1
+
+.align 2
+1:
+	norvc_3a addi a0, a0, -16
+	sprite_ablit16_alpha_loop_body 7
+	sprite_ablit16_alpha_loop_body 6
+	sprite_ablit16_alpha_loop_body 5
+	sprite_ablit16_alpha_loop_body 4
+	sprite_ablit16_alpha_loop_body 3
+	sprite_ablit16_alpha_loop_body 2
+	sprite_ablit16_alpha_loop_body 1
+	sprite_ablit16_alpha_loop_body 0
+3:
+	// Keep going while the block pointer is still above the span start
+	bltu a4, a0, 1b
+	ret
--- a/software/libsprite/tile.c
+++ b/software/libsprite/tile.c
@ -1,6 +1,6 @@
 #include "tile.h"

-#include "pico/platform.h" // for __not_in_flash
+#include "pico.h" // for __not_in_flash
 #include "hardware/interp.h"

 #define __ram_func(foo) __not_in_flash(#foo) foo
--- a/software/libsprite/tile_armv6m.S
+++ b/software/libsprite/tile_armv6m.S
--- a/software/libsprite/tile_riscv.S
+++ b/software/libsprite/tile_riscv.S
@ -0,0 +1,188 @@
+#include "hardware/regs/addressmap.h"
+#include "hardware/regs/sio.h"
+
+#include "sprite_asm_const.h"
+
+#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+
+// ----------------------------------------------------------------------------
+// Tile layout
+//
+// Some terms:
+// Tileset: 1D array of tile images, concatenated image-after-image
+// Tilemap: 2D array of tileset indices
+//
+// Each tile image in a tileset is the same size. Tiles are square, either 8 x
+// 8 px or 16 x 16 px. This makes it easy to find the start of a tile image
+// given the tileset base pointer and a tile index (add + shift).
+//
+// Tilemaps are 8 bits per tile, always.
+//
+// One advantage of this layout is that y coordinates can be handled outside
+// of the loops in this file, which are all scanline-oriented, by offsetting
+// the tileset and tilemap pointers passed in. These routines only care about
+// x. The tileset pointer is offset by y modulo tile height, and the tilemap
+// pointer is offset by y divided by tile height, modulo tilemap height in
+// tiles.
+
+// Tileset: 16px tiles, 16bpp, with 1-bit alpha.
+// Tilemap: 8 bit indices.
+
+.macro do_2px_16bpp_alpha rd rs rx dstoffs // store the 2 px packed in rs to dstoffs(rd), each only if its alpha test passes
+.option push
+.option norvc // every instruction in this macro is assembled uncompressed
+	// TODO we could save a shift here by making alpha the MSB (not worth it
+	// on Arm due to lack of sign-extension or flag update on loads)
+	slli \rx, \rs, 32 - ALPHA_SHIFT_16BPP // low pixel: bit ALPHA_SHIFT_16BPP-1 -> sign bit (rx is scratch)
+	bgez \rx, 1f // bit clear: skip the low pixel
+	sh \rs, \dstoffs(\rd)
+1:
+	slli \rx, \rs, 16 - ALPHA_SHIFT_16BPP // high pixel: same bit, 16 positions further up
+	bgez \rx, 1f // bit clear: skip the high pixel
+	srli \rs, \rs, 16 // clobbers rs: high pixel moves down to the low halfword
+	sh \rs, \dstoffs+2(\rd)
+1:
+.option pop
+.endm
+
+.macro do_2px_16bpp rd rs dstoffs // store both halfwords of rs to dstoffs(rd), no alpha test; clobbers rs
+	sh \rs, \dstoffs(\rd) // low halfword
+	srli \rs, \rs, 16
+	sh \rs, \dstoffs+2(\rd) // high halfword
+.endm
+
+// tile16_16px_loop_alpha_or_nonalpha: write one span of 16 px-tile, 16bpp
+// pixels to dst. interp1 has been set up to give the next x-ward pointer into
+// the tilemap with each pop. This saves us having to remember the tilemap
+// pointer and tilemap x size mask in core registers.
+// a0: dst
+// a1: tileset
+// a2: x0 (start pos in tile space)
+// a3: x1 (end pos in tile space, exclusive)
+// Clobbers a0-a7, t0 and t1; the stack and saved registers are not touched.
+// Instantiated with alpha=1 and alpha=0 to get both variants of the loop.
+// Linker garbage collection ensures we only keep the versions we use.
+
+.macro tile16_16px_loop_alpha_or_nonalpha alpha
+	li a7, SIO_BASE + SIO_INTERP1_ACCUM0_OFFSET
+
+	// The main loop only handles whole tiles, so we may need to first copy
+	// individual pixels to get tile-aligned. Skip this entirely if we are
+	// already aligned, to avoid the extra interp pop.
+	andi a5, a2, 0xf
+	beqz a5, 3f
+
+	// Get pointer to tileset image
+	lw a4, POP2_OFFS(a7)
+	lbu a4, (a4)   // dep stall
+	slli a4, a4, 9 // 16 px wide * 16 px high * 2 bytes/px
+	add a4, a4, a1
+	// Offset tile image pointer to align with x0
+	sh1add a4, a5, a4
+	// Fall through into copy loop
+1:
+	lhu a5, (a4)
+	addi a4, a4, 2 // hoisted to fill load dependency slot
+.if \alpha
+	slli a6, a5, 32 - ALPHA_SHIFT_16BPP // bit ALPHA_SHIFT_16BPP-1 -> sign bit
+	bgez a6, 2f // bit clear: leave this dst pixel untouched
+.endif
+	sh a5, (a0)
+2:
+	addi a0, a0, 2
+	addi a2, a2, 1
+	// Skip out if we have already reached end of span:
+	bgeu a2, a3, 3f
+	// Loop if we are not yet aligned: (TODO these checks could be merged)
+	andi a6, a2, 0xf
+	bnez a6, 1b
+3:
+	// The next output pixel is aligned to the start of a tile. Set up main loop.
+
+	// Tileset pointer is only needed occasionally, so free up a1 for better
+	// code density:
+	mv t0, a1
+	// t1: dst limit pointer at end of all pixels:
+	sub a3, a3, a2 // a3 = pixels still to write
+	sh1add t1, a3, a0
+	// a5: dst limit pointer at end of whole tiles:
+	andi a4, a3, ~0xf // round remaining pixels down to a multiple of 16
+	sh1add a5, a4, a0
+
+	// a0 is dst, a7 is interp base, a1-a4 are trashed by loop, a5 is dst limit.
+	// Early skip for case of 0 whole tiles:
+	bgeu a0, a5, 3f
+2:
+	// Get next tilemap pointer
+	lw a1, POP2_OFFS(a7)
+	// Get tile image pointer
+	lbu a1, (a1) // dep stall
+	slli a1, a1, 9 // tile index * 512 bytes per tile image
+	add a1, a1, t0
+
+.if \alpha
+	lw a3,  0(a1)
+	lw a4,  4(a1)
+	do_2px_16bpp_alpha a0 a3 a2 0
+	do_2px_16bpp_alpha a0 a4 a2 4
+	lw a3,  8(a1)
+	lw a4, 12(a1)
+	do_2px_16bpp_alpha a0 a3 a2 8
+	do_2px_16bpp_alpha a0 a4 a2 12
+	lw a3, 16(a1)
+	lw a4, 20(a1)
+	do_2px_16bpp_alpha a0 a3 a2 16
+	do_2px_16bpp_alpha a0 a4 a2 20
+	lw a3, 24(a1)
+	lw a4, 28(a1)
+	do_2px_16bpp_alpha a0 a3 a2 24
+	do_2px_16bpp_alpha a0 a4 a2 28
+.else
+	lw a3,  0(a1)
+	lw a4,  4(a1)
+	do_2px_16bpp a0 a3 0
+	do_2px_16bpp a0 a4 4
+	lw a3,  8(a1)
+	lw a4, 12(a1)
+	do_2px_16bpp a0 a3 8
+	do_2px_16bpp a0 a4 12
+	lw a3, 16(a1)
+	lw a4, 20(a1)
+	do_2px_16bpp a0 a3 16
+	do_2px_16bpp a0 a4 20
+	lw a3, 24(a1)
+	lw a4, 28(a1)
+	do_2px_16bpp a0 a3 24
+	do_2px_16bpp a0 a4 28
+.endif
+	addi a0, a0, 32 // one tile row: 16 px * 2 bytes
+	bltu a0, a5, 2b
+3:
+
+	// Skip ahead if there are no spare pixels to tidy up
+	bgeu a0, t1, 3f
+	// Copy <1 tile's worth of loose pixels
+	lw a4, POP2_OFFS(a7)
+	lbu a4, (a4) // dep stall
+	slli a4, a4, 9
+	add a4, a4, t0
+1:
+	lhu a5, (a4) // zero-extending, as in the head loop; only bits 15:0 are used
+	addi a4, a4, 2
+.if \alpha
+	slli a6, a5, 32 - ALPHA_SHIFT_16BPP
+	bgez a6, 2f
+.endif
+	sh a5, (a0)
+2:
+	addi a0, a0, 2
+	bltu a0, t1, 1b
+3:
+	ret
+.endm
+
+decl_func tile16_16px_alpha_loop // alpha=1 variant: each pixel is stored only if its alpha test passes
+	tile16_16px_loop_alpha_or_nonalpha 1
+
+decl_func tile16_16px_loop // alpha=0 variant: every pixel is stored
+	tile16_16px_loop_alpha_or_nonalpha 0