From 58cc97e29fccd80d6803e1c763c762f03eb1a2e4 Mon Sep 17 00:00:00 2001
From: Luke Wren <wren6991@gmail.com>
Date: Sun, 28 Feb 2021 11:23:38 +0000
Subject: [PATCH] Hacking on 2 symbols per word (single->diff inside PIO), it
 compiles, and that is the best I can say about it

---
 software/include/common_dvi_pin_configs.h |   2 +-
 software/libdvi/CMakeLists.txt            |   1 +
 software/libdvi/dvi.c                     |  20 ++--
 software/libdvi/dvi_config_defs.h         |  16 ++-
 software/libdvi/dvi_serialiser.c          |  56 ++++-----
 software/libdvi/dvi_serialiser.h          |   1 -
 software/libdvi/dvi_serialiser.pio        |  82 +++++---------
 software/libdvi/dvi_timing.c              |  53 +++++----
 software/libdvi/tmds_encode.S             |  52 ++++-----
 software/libdvi/tmds_table.h              | 132 +++++++++++-----------
 software/libdvi/tmds_table_gen.py         |  38 +++++--
 11 files changed, 234 insertions(+), 219 deletions(-)

diff --git a/software/include/common_dvi_pin_configs.h b/software/include/common_dvi_pin_configs.h
index c562e50..d853283 100644
--- a/software/include/common_dvi_pin_configs.h
+++ b/software/include/common_dvi_pin_configs.h
@@ -8,7 +8,7 @@
 #include "dvi_serialiser.h"
 
 #ifndef DEFAULT_DVI_SERIAL_CONFIG
-#define DEFAULT_DVI_SERIAL_CONFIG picodvi_dvi_cfg
+#define DEFAULT_DVI_SERIAL_CONFIG pico_sock_cfg
 #endif
 
 // ----------------------------------------------------------------------------
diff --git a/software/libdvi/CMakeLists.txt b/software/libdvi/CMakeLists.txt
index d615140..7c52661 100644
--- a/software/libdvi/CMakeLists.txt
+++ b/software/libdvi/CMakeLists.txt
@@ -26,6 +26,7 @@ target_link_libraries(libdvi INTERFACE
 	hardware_dma
 	hardware_interp
 	hardware_pio
+	hardware_pwm
 	)
 
 pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio)
diff --git a/software/libdvi/dvi.c b/software/libdvi/dvi.c
index 3efbe20..7b827b6 100644
--- a/software/libdvi/dvi.c
+++ b/software/libdvi/dvi.c
@@ -43,9 +43,9 @@ void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_col
 	for (int i = 0; i < DVI_N_TMDS_BUFFERS; ++i) {
 		void *tmdsbuf;
 #if DVI_MONOCHROME_TMDS
-		tmdsbuf = malloc(inst->timing->h_active_pixels * sizeof(uint32_t));
+		tmdsbuf = malloc(inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
 #else
-		tmdsbuf = malloc(3 * inst->timing->h_active_pixels * sizeof(uint32_t));
+		tmdsbuf = malloc(3 * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
 #endif
 		if (!tmdsbuf)
 			panic("TMDS buffer allocation failed");
@@ -118,6 +118,7 @@ static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *ins
 	uint32_t *tmdsbuf;
 	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 	uint pixwidth = inst->timing->h_active_pixels;
+	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
 	// TODO maybe want to make this configurable one day
 	// anyhoo we are abutting the buffers in TMDS channel order
 	const uint red_msb   = 7;
@@ -126,10 +127,10 @@ static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *ins
 	const uint green_lsb = 2;
 	const uint blue_msb  = 1;
 	const uint blue_lsb  = 0;
-	// NB the scanline buffers are half-resolution!
-	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf, pixwidth / 2, blue_msb, blue_lsb);
-	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + pixwidth, pixwidth / 2, green_msb, green_lsb);
-	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * pixwidth, pixwidth / 2, red_msb, red_lsb);
+	// Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter.
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, blue_msb, blue_lsb);
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, green_msb, green_lsb);
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, red_msb, red_lsb);
 	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
 }
 
@@ -137,15 +138,16 @@ static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *in
 	uint32_t *tmdsbuf;
 	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 	uint pixwidth = inst->timing->h_active_pixels;
+	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
 	const uint red_msb   = 15;
 	const uint red_lsb   = 11;
 	const uint green_msb = 10;
 	const uint green_lsb = 5;
 	const uint blue_msb  = 4;
 	const uint blue_lsb  = 0;
-	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf, pixwidth / 2, blue_msb, blue_lsb);
-	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + pixwidth, pixwidth / 2, green_msb, green_lsb);
-	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * pixwidth, pixwidth / 2, red_msb, red_lsb);
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, blue_msb, blue_lsb);
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, green_msb, green_lsb);
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, red_msb, red_lsb);
 	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
 }
 
diff --git a/software/libdvi/dvi_config_defs.h b/software/libdvi/dvi_config_defs.h
index 3245787..a8992f9 100644
--- a/software/libdvi/dvi_config_defs.h
+++ b/software/libdvi/dvi_config_defs.h
@@ -7,7 +7,7 @@
 // target_compile_definitions())
 
 // Pull in base headers to make sure board definitions override the
-// definitions provided here.
+// definitions provided here. Note this file is included in asm and C.
 #include "hardware/platform_defs.h"
 #include "pico/config.h"
 
@@ -40,6 +40,20 @@
 #define DVI_MONOCHROME_TMDS 0
 #endif
 
+// By default, we assume each 32-bit word written to a PIO FIFO contains 2x
+// 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant
+// first. This is convenient if you are generating two or more pixels at once,
+// e.g. using the pixel-doubling TMDS encode. You can change this value to 1
+// (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note
+// that this value needs to divide the DVI horizontal timings, so is limited
+// to 1 or 2.
+#ifndef DVI_SYMBOLS_PER_WORD
+#define DVI_SYMBOLS_PER_WORD 2
+#endif
+
+#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
+#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
+#endif
 
 // ----------------------------------------------------------------------------
 // TMDS encode controls
diff --git a/software/libdvi/dvi_serialiser.c b/software/libdvi/dvi_serialiser.c
index 823bf30..f740680 100644
--- a/software/libdvi/dvi_serialiser.c
+++ b/software/libdvi/dvi_serialiser.c
@@ -1,16 +1,17 @@
 #include "pico.h"
 #include "hardware/pio.h"
 #include "hardware/gpio.h"
+#include "hardware/pwm.h"
 #include "hardware/structs/padsbank0.h"
 
 #include "dvi.h"
 #include "dvi_serialiser.h"
 #include "dvi_serialiser.pio.h"
 
-static void dvi_init_gpio(uint gpio, bool invert) {
+static void dvi_configure_pad(uint gpio, bool invert) {
 	// 2 mA drive, enable slew rate limiting (this seems fine even at 720p30, and
 	// the 3V3 LDO doesn't get warm like when turning all the GPIOs up to 11).
-	// Also disable digital reciever.
+	// Also disable digital receiver.
 	hw_write_masked(
 		&padsbank0_hw->io[gpio],
 		(0 << PADS_BANK0_GPIO0_DRIVE_LSB),
@@ -22,52 +23,51 @@ static void dvi_init_gpio(uint gpio, bool invert) {
 void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) {
 #if DVI_SERIAL_DEBUG
 	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_debug_program);
-	uint offset_clk = offset;
 #else
 	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_program);
-	uint offset_clk = pio_add_program(cfg->pio, &dvi_serialiser_clk_program);
 #endif
 	cfg->prog_offs = offset;
-	cfg->prog_offs_clk = offset_clk;
 
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		pio_sm_claim(cfg->pio, cfg->sm_tmds[i]);
 		dvi_serialiser_program_init(
 			cfg->pio,
 			cfg->sm_tmds[i],
-			i == TMDS_SYNC_LANE ? offset_clk : offset,
+			offset,
 			cfg->pins_tmds[i],
-			cfg->pins_clk,
-			i == TMDS_SYNC_LANE,
 			DVI_SERIAL_DEBUG
 		);
-		dvi_init_gpio(cfg->pins_tmds[i], cfg->invert_diffpairs);
-		dvi_init_gpio(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs);
+		dvi_configure_pad(cfg->pins_tmds[i], cfg->invert_diffpairs);
+		dvi_configure_pad(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs);
 	}
-	dvi_init_gpio(cfg->pins_clk, cfg->invert_diffpairs);
-	dvi_init_gpio(cfg->pins_clk + 1, cfg->invert_diffpairs);
+
+	// Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same
+	// slice (lower-numbered GPIO must be even).
+	assert(cfg->pins_clk % 2 == 0);
+	uint slice = pwm_gpio_to_slice_num(cfg->pins_clk);
+	// 5 cycles high, 5 low. Invert one channel so that we get complementary outputs.
+	const uint pwm_wrap = 10 - 1;
+	const uint pwm_level = (pwm_wrap + 1) / 2; // Half of the 10-cycle period; wrap / 2 would be 4 (40% duty)
+	pwm_config pwm_cfg = pwm_get_default_config();
+	pwm_config_set_output_polarity(&pwm_cfg, true, false);
+	pwm_config_set_wrap(&pwm_cfg, pwm_wrap);
+	pwm_init(slice, &pwm_cfg, false);
+	pwm_set_both_levels(slice, pwm_level, pwm_level);
+	// NOTE(review): no visible call selects GPIO_FUNC_PWM for the clock pins -- confirm.
+	dvi_configure_pad(cfg->pins_clk, cfg->invert_diffpairs);
+	dvi_configure_pad(cfg->pins_clk + 1, cfg->invert_diffpairs);
 }
 
 void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) {
 	uint mask = 0;
 	for (int i = 0; i < N_TMDS_LANES; ++i)
 		mask |= 1u << (cfg->sm_tmds[i] + PIO_CTRL_SM_ENABLE_LSB);
-	if (enable)
+	if (enable) {
 		hw_set_bits(&cfg->pio->ctrl, mask);
-	else
-		hw_clear_bits(&cfg->pio->ctrl, mask);
-}
-
-uint32_t dvi_single_to_diff(uint32_t in) {
-	uint32_t accum = 0;
-	const uint TMDS_SIZE = 10;
-	for (int i = 0; i < TMDS_SIZE; ++i) {
-		accum <<= 2;
-		if (in & 1 << (TMDS_SIZE - 1))
-			accum |= 0x1;
-		else
-			accum |= 0x2;
-		in <<= 1;
+		pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), true);
+	}
+	else {
+		hw_clear_bits(&cfg->pio->ctrl, mask);
+		pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), false);
 	}
-	return accum;
 }
diff --git a/software/libdvi/dvi_serialiser.h b/software/libdvi/dvi_serialiser.h
index 9e0845b..d978f60 100644
--- a/software/libdvi/dvi_serialiser.h
+++ b/software/libdvi/dvi_serialiser.h
@@ -13,7 +13,6 @@ struct dvi_serialiser_cfg {
 	uint pins_clk;
 	bool invert_diffpairs;
 	uint prog_offs;
-	uint prog_offs_clk;
 };
 
 void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg);
diff --git a/software/libdvi/dvi_serialiser.pio b/software/libdvi/dvi_serialiser.pio
index cb0720a..520c8e0 100644
--- a/software/libdvi/dvi_serialiser.pio
+++ b/software/libdvi/dvi_serialiser.pio
@@ -1,73 +1,51 @@
-.program dvi_serialiser_clk
-.side_set 2
-
-.wrap_target
-	out pins, 2  side 0b10
-	out pins, 2  side 0b10
-	out pins, 2  side 0b10
-	out pins, 2  side 0b10
-	out pins, 2  side 0b10
-	out pins, 2  side 0b01
-	out pins, 2  side 0b01
-	out pins, 2  side 0b01
-	out pins, 2  side 0b01
-	out pins, 2  side 0b01
-.wrap
-
-
 .program dvi_serialiser
+.side_set 2
+.origin 0
 
-.wrap_target
-	out pins, 2
-.wrap
+; Single-ended -> differential serial
 
+	out pc, 1    side 0b10
+	out pc, 1    side 0b01
+
+.program dvi_serialiser_debug
+.side_set 1 opt
 
 ; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop
 ; bit, and 5/6ths the data throughput of the TMDS version.
 
-.program dvi_serialiser_debug
-.side_set 2 opt
-
-.wrap_target
-	pull side 0x1 ; FIFO stall extends stop bit
-	nop  side 0x2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-	out pins, 2
-.wrap
-
+	pull ifempty  side 1 ; Extend stop bit with FIFO stall
+	nop           side 0
+	out pins, 1          ; Unrolled because we require 1 bit / clk
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
 	
 % c-sdk {
-static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, uint clk_pins, bool clk, bool debug) {
-    pio_sm_set_pins_with_mask(pio, sm, 1u << data_pins | 1u << clk_pins, 3u << data_pins | 3u << clk_pins);
-    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins | 3u << clk_pins);
-    // Pseudo-differential pairs:
+#include "dvi_config_defs.h"
+
+static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
+    pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
+    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
     pio_gpio_init(pio, data_pins);
     pio_gpio_init(pio, data_pins + 1);
-    pio_gpio_init(pio, clk_pins);
-    pio_gpio_init(pio, clk_pins + 1);
+
     pio_sm_config c;
     if (debug) {
         c = dvi_serialiser_debug_program_get_default_config(offset);
-        sm_config_set_sideset_pins(&c, data_pins);
-    }
-    else if (clk) {
-        c = dvi_serialiser_clk_program_get_default_config(offset);
-        sm_config_set_sideset_pins(&c, clk_pins);
     }
     else {
         c = dvi_serialiser_program_get_default_config(offset);
     }
-    sm_config_set_out_pins(&c, data_pins, 2);
-    // Each TMDS symbol is 10 pairs of pseudo-differential bits:
-    sm_config_set_out_shift(&c, true, !debug, 20);
+    sm_config_set_sideset_pins(&c, data_pins);
+    if (debug)
+	    sm_config_set_out_pins(&c, data_pins, 1);
+    sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
     sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
     pio_sm_init(pio, sm, offset, &c);
     pio_sm_set_enabled(pio, sm, false);
diff --git a/software/libdvi/dvi_timing.c b/software/libdvi/dvi_timing.c
index 958d0b2..aa8308f 100644
--- a/software/libdvi/dvi_timing.c
+++ b/software/libdvi/dvi_timing.c
@@ -190,21 +190,25 @@ const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = {
 // four regular IRQs per scanline and return early from 3 of them, but this
 // breaks down when you have very short scanline sections like guard bands.
 
-// Note we particularly want these to be in memory because these addresses get
-// a LOT of DMA traffic!
+// Each control symbol appears twice, concatenated in the 20 LSBs of one word
+// (i.e. laid out for two symbols per word). Must be in RAM: heavy DMA traffic.
 const uint32_t __dvi_const(dvi_ctrl_syms)[4] = {
-	0x5999a,
-	0xa6665,
-	0x9999a,
-	0x66665,
+	0xd5354,
+	0x2acab,
+	0x55154,
+	0xaaeab
 };
 
 // Output solid red scanline if we are given NULL for tmdsbuff
-static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = {
-	0x9aaaa, 0x95555, // 0x00
-	0x9aaaa, 0x95555, // 0x00
-	0x6aaa9, 0x65556  // 0xfc
+#if DVI_SYMBOLS_PER_WORD == 2
+static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[3] = {
+	0x7fd00u, // 0x00 (two DC-balanced symbols in the 20 LSBs, first symbol lowest)
+	0x7fd00u, // 0x00
+	0xbfa01u  // 0xfc
 };
+#else
+#error "Can't handle empty scanlines with pixel-per-word right now"
+#endif
 
 void dvi_timing_state_init(struct dvi_timing_state *t) {
 	t->v_ctr = 0;
@@ -255,17 +259,17 @@ void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_
 	const uint32_t *sym_no_sync   = get_ctrl_sym(false,  false             );
 
 	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
-	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch,   2, false);
-	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width,    2, false);
-	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch,    2, true);
-	_set_data_cb(&synclist[3], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_active_pixels, 2, false);
+	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch   / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width    / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch    / DVI_SYMBOLS_PER_WORD, 2, true);
+	_set_data_cb(&synclist[3], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false);
 
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		if (i == TMDS_SYNC_LANE)
 			continue;
 		dma_cb_t *cblist = dvi_lane_from_list(l, i);
-		_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync, t->h_front_porch + t->h_sync_width + t->h_back_porch, 2, false);
-		_set_data_cb(&cblist[1], &dma_cfg[i], sym_no_sync, t->h_active_pixels, 2, false);
+		_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
+		_set_data_cb(&cblist[1], &dma_cfg[i], sym_no_sync, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false);
 	}
 }
 
@@ -277,23 +281,26 @@ void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_
 	const uint32_t *sym_no_sync   = get_ctrl_sym(false,                false             );
 
 	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
-	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch, 2, false);
-	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width,  2, false);
-	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch,  2, true);
+	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width  / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch  / DVI_SYMBOLS_PER_WORD, 2, true);
 
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		dma_cb_t *cblist = dvi_lane_from_list(l, i);
 		if (i != TMDS_SYNC_LANE) {
-			_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync, t->h_front_porch + t->h_sync_width + t->h_back_porch, 2, false);
+			_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,
+				(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
 		}
 		int target_block = i == TMDS_SYNC_LANE ? DVI_SYNC_LANE_CHUNKS - 1 :  DVI_NOSYNC_LANE_CHUNKS - 1;
 		if (tmdsbuf) {
 			// Non-repeating DMA for the freshly-encoded TMDS buffer
-			_set_data_cb(&cblist[target_block], &dma_cfg[i], tmdsbuf + i * t->h_active_pixels, t->h_active_pixels, 0, false);
+			_set_data_cb(&cblist[target_block], &dma_cfg[i], tmdsbuf + i * (t->h_active_pixels / DVI_SYMBOLS_PER_WORD),
+				t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 0, false);
 		}
 		else {
 			// 8-byte read ring mode to repeat the correct DC-balanced symbol pair on blank scanlines
-			_set_data_cb(&cblist[target_block], &dma_cfg[i], &empty_scanline_tmds[2 * i], t->h_active_pixels, 3, false);
+			_set_data_cb(&cblist[target_block], &dma_cfg[i], &empty_scanline_tmds[2 * i / DVI_SYMBOLS_PER_WORD],
+				t->h_active_pixels / DVI_SYMBOLS_PER_WORD, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false); // 4-byte ring for one word per lane, 8-byte for two
 		}
 	}
 }
@@ -303,7 +310,7 @@ void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const
 #if DVI_MONOCHROME_TMDS
 		const uint32_t *lane_tmdsbuf = tmdsbuf;
 #else
-		const uint32_t *lane_tmdsbuf = tmdsbuf + i * t->h_active_pixels;
+		const uint32_t *lane_tmdsbuf = tmdsbuf + i * t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
 #endif
 		if (i == TMDS_SYNC_LANE)
 			dvi_lane_from_list(l, i)[3].read_addr = lane_tmdsbuf;
diff --git a/software/libdvi/tmds_encode.S b/software/libdvi/tmds_encode.S
index 3e21708..9a68fd1 100644
--- a/software/libdvi/tmds_encode.S
+++ b/software/libdvi/tmds_encode.S
@@ -44,7 +44,7 @@
 
 decl_func tmds_encode_loop_16bpp
 	push {r4, r5, r6, r7, lr}
-	lsls r2, #3
+	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@@ -55,10 +55,12 @@ decl_func tmds_encode_loop_16bpp
 	ldmia r0!, {r4}
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
-	ldmia r4, {r4, r5}
+	ldr r4, [r4]
 	ldr r6, [r2, #PEEK1_OFFS]
-	ldmia r6, {r6, r7}
-	stmia r1!, {r4, r5, r6, r7}
+	ldr r6, [r6]
+	// TODO our pixels are now 2 per word instead of 1 per word, so this store is
+	// now 2 words instead of 4; reexpand it.
+	stmia r1!, {r4, r6}
 .endr
 2:
 	cmp r1, ip
@@ -75,7 +77,7 @@ decl_func tmds_encode_loop_16bpp
 
 decl_func tmds_encode_loop_16bpp_leftshift
 	push {r4, r5, r6, r7, lr}
-	lsls r2, #3
+	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@@ -87,10 +89,10 @@ decl_func tmds_encode_loop_16bpp_leftshift
 	lsls r4, r3
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
-	ldmia r4, {r4, r5}
+	ldr r4, [r4]
 	ldr r6, [r2, #PEEK1_OFFS]
-	ldmia r6, {r6, r7}
-	stmia r1!, {r4, r5, r6, r7}
+	ldr r6, [r6]
+	stmia r1!, {r4, r6}
 .endr
 2:
 	cmp r1, ip
@@ -103,7 +105,7 @@ decl_func tmds_encode_loop_16bpp_leftshift
 
 decl_func tmds_encode_loop_8bpp
 	push {r4, r5, r6, r7, lr}
-	lsls r2, #3
+	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@@ -112,17 +114,16 @@ decl_func tmds_encode_loop_8bpp
 1:
 .rept TMDS_ENCODE_UNROLL
 	ldmia  r0!, {r4}
-	str r4, [r2, #ACCUM0_OFFS]
 	str r4, [r2, #ACCUM0_OFFS + INTERP1]
+	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
-	ldmia r4, {r4, r5}
-	ldr r6, [r2, #PEEK1_OFFS]
-	ldmia r6, {r6, r7}
-	stmia r1!, {r4, r5, r6, r7}
-	ldr r4, [r2, #PEEK0_OFFS + INTERP1]
-	ldmia r4, {r4, r5}
-	ldr r6, [r2, #PEEK1_OFFS + INTERP1]
-	ldmia r6, {r6, r7}
+	ldr r4, [r4]
+	ldr r5, [r2, #PEEK1_OFFS]
+	ldr r5, [r5]
+	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
+	ldr r6, [r6]
+	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
+	ldr r7, [r7]
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
@@ -155,14 +156,13 @@ decl_func tmds_encode_loop_8bpp_leftshift
 	lsls r4, r3
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
-	ldmia r4, {r4, r5}
-	ldr r6, [r2, #PEEK1_OFFS]
-	ldmia r6, {r6, r7}
-	stmia r1!, {r4, r5, r6, r7}
-	ldr r4, [r2, #PEEK0_OFFS + INTERP1]
-	ldmia r4, {r4, r5}
-	ldr r6, [r2, #PEEK1_OFFS + INTERP1]
-	ldmia r6, {r6, r7}
+	ldr r4, [r4]
+	ldr r5, [r2, #PEEK1_OFFS]
+	ldr r5, [r5]
+	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
+	ldr r6, [r6]
+	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
+	ldr r7, [r7]
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
diff --git a/software/libdvi/tmds_table.h b/software/libdvi/tmds_table.h
index ce7c52d..216100d 100644
--- a/software/libdvi/tmds_table.h
+++ b/software/libdvi/tmds_table.h
@@ -4,73 +4,73 @@
 // with data content *almost* equal (1 LSB off) to input value left shifted by
 // two. The pairs of symbols have a net DC balance of 0.
 //
-// Each symbol is represented by a 20 bit value consisting of 10 differential
-// bit pairs.
+// The two symbols are concatenated in the 20 LSBs of a data word, with the
+// first symbol in least-significant position.
 //
 // Note the declaration isn't included here, just the table body. This is in
 // case you want multiple copies of the table in different SRAMs (particularly
 // scratch X/Y).
-0x9aaaa, 0x95555,
-0x9555a, 0x9aaa5,
-0x9556a, 0x9aa95,
-0x9aa9a, 0x95565,
-0x955aa, 0x9aa55,
-0x9aa5a, 0x955a5,
-0x9aa6a, 0x95595,
-0x9559a, 0x9aa65,
-0x956aa, 0x9a955,
-0x9a95a, 0x956a5,
-0x9a96a, 0x95695,
-0x9569a, 0x9a965,
-0x9a9aa, 0x95655,
-0x9565a, 0x9a9a5,
-0x9566a, 0x9a995,
-0x69aa9, 0x66556,
-0x95aaa, 0x9a555,
-0x9a55a, 0x95aa5,
-0x9a56a, 0x95a95,
-0x95a9a, 0x9a565,
-0x9a5aa, 0x95a55,
-0x95a5a, 0x9a5a5,
-0x95a6a, 0x9a595,
-0x696a9, 0x66956,
-0x9a6aa, 0x95955,
-0x9595a, 0x9a6a5,
-0x9596a, 0x9a695,
-0x695a9, 0x66a56,
-0x959aa, 0x9a655,
-0x69569, 0x66a96,
-0x69559, 0x66aa6,
-0x66aa9, 0x69556,
-0x96aaa, 0x99555,
-0x9955a, 0x96aa5,
-0x9956a, 0x96a95,
-0x96a9a, 0x99565,
-0x995aa, 0x96a55,
-0x96a5a, 0x995a5,
-0x96a6a, 0x99595,
-0x6a6a9, 0x65956,
-0x996aa, 0x96955,
-0x9695a, 0x996a5,
-0x9696a, 0x99695,
-0x6a5a9, 0x65a56,
-0x969aa, 0x99655,
-0x6a569, 0x65a96,
-0x6a559, 0x65aa6,
-0x65aa9, 0x6a556,
-0x99aaa, 0x96555,
-0x9655a, 0x99aa5,
-0x9656a, 0x99a95,
-0x6a9a9, 0x65656,
-0x965aa, 0x99a55,
-0x6a969, 0x65696,
-0x6a959, 0x656a6,
-0x656a9, 0x6a956,
-0x966aa, 0x99955,
-0x6aa69, 0x65596,
-0x6aa59, 0x655a6,
-0x655a9, 0x6aa56,
-0x6aa99, 0x65566,
-0x65569, 0x6aa96,
-0x65559, 0x6aaa6,
-0x6aaa9, 0x65556,
+0x7fd00u,
+0x40dfcu,
+0x41df8u,
+0x7ed04u,
+0x43df0u,
+0x7cd0cu,
+0x7dd08u,
+0x42df4u,
+0x47de0u,
+0x78d1cu,
+0x79d18u,
+0x46de4u,
+0x7bd10u,
+0x44decu,
+0x45de8u,
+0xafa41u,
+0x4fdc0u,
+0x70d3cu,
+0x71d38u,
+0x4edc4u,
+0x73d30u,
+0x4cdccu,
+0x4ddc8u,
+0xa7a61u,
+0x77d20u,
+0x48ddcu,
+0x49dd8u,
+0xa3a71u,
+0x4bdd0u,
+0xa1a79u,
+0xa0a7du,
+0x9fa81u,
+0x5fd80u,
+0x60d7cu,
+0x61d78u,
+0x5ed84u,
+0x63d70u,
+0x5cd8cu,
+0x5dd88u,
+0xb7a21u,
+0x67d60u,
+0x58d9cu,
+0x59d98u,
+0xb3a31u,
+0x5bd90u,
+0xb1a39u,
+0xb0a3du,
+0x8fac1u,
+0x6fd40u,
+0x50dbcu,
+0x51db8u,
+0xbba11u,
+0x53db0u,
+0xb9a19u,
+0xb8a1du,
+0x87ae1u,
+0x57da0u,
+0xbda09u,
+0xbca0du,
+0x83af1u,
+0xbea05u,
+0x81af9u,
+0x80afdu,
+0xbfa01u,
diff --git a/software/libdvi/tmds_table_gen.py b/software/libdvi/tmds_table_gen.py
index 8b39b35..e66c0da 100755
--- a/software/libdvi/tmds_table_gen.py
+++ b/software/libdvi/tmds_table_gen.py
@@ -81,20 +81,34 @@ def differentialise(x, n):
 enc = TMDSEncode()
 
 
-def disptable_format(sym):
-	return differentialise(sym, 10) | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
+###
+# Pixel-doubled table:
 
-print("// Non-negative running disparity:")
-for i in range(0, 256, 4):
-	enc.imbalance = 1
-	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+# for i in range(0, 256, 4):
+# 	sym0 = enc.encode(i, 0, 1)
+# 	sym1 = enc.encode(i ^ 1, 0, 1)
+# 	print(f"0x{sym0 | (sym1 << 10):05x}u,")
 
-print("// Negative running disparity:")
-for i in range(0, 256, 4):
-	enc.imbalance = -1
-	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+###
+# Fullres table stuff:
 
+# def disptable_format(sym):
+# 	return differentialise(sym, 10) | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
 
-# for i in range(4):
-# 	print("0x{:05x},".format(differentialise(enc.encode(0, i, 0), 10)))
+# print("// Non-negative running disparity:")
+# for i in range(0, 256, 4):
+# 	enc.imbalance = 1
+# 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+
+# print("// Negative running disparity:")
+# for i in range(0, 256, 4):
+# 	enc.imbalance = -1
+# 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+
+###
+# Control symbols:
+
+for i in range(4):
+	sym = enc.encode(0, i, 0)
+	print(f"0x{sym << 10 | sym:05x},")