kopia lustrzana https://github.com/Wren6991/PicoDVI
				
				
				
			
		
			
				
	
	
		
			624 wiersze
		
	
	
		
			16 KiB
		
	
	
	
		
			ArmAsm
		
	
	
			
		
		
	
	
			624 wiersze
		
	
	
		
			16 KiB
		
	
	
	
		
			ArmAsm
		
	
	
#include "hardware/regs/addressmap.h"
 | 
						|
#include "hardware/regs/sio.h"
 | 
						|
#include "dvi_config_defs.h"
 | 
						|
 | 
						|
// Register offsets usable as ldr/str immediates (must be <= 0x7c). Every
// offset is measured from INTERP0's ACCUM0 register, which is the address the
// encode loops hold in their base register.
#define INTERP0_REL(reg) (SIO_INTERP0_ ## reg ## _OFFSET - SIO_INTERP0_ACCUM0_OFFSET)

#define ACCUM0_OFFS     INTERP0_REL(ACCUM0)
#define ACCUM1_OFFS     INTERP0_REL(ACCUM1)
#define ACCUM1_ADD_OFFS INTERP0_REL(ACCUM1_ADD)
#define PEEK0_OFFS      INTERP0_REL(PEEK_LANE0)
#define PEEK1_OFFS      INTERP0_REL(PEEK_LANE1)
#define PEEK2_OFFS      INTERP0_REL(PEEK_FULL)

// Add INTERP1 to any of the offsets above to reach the same register in the
// second interpolator.
#define INTERP1         (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)

// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)

.syntax unified
.cpu cortex-m0plus
.thumb
 | 
						|
 | 
						|
// Begin a global Thumb function called \name, placed in its own
// .scratch_x.\name section (allocatable + executable). The function body
// follows the macro invocation.
.macro decl_func_x name
	.section .scratch_x.\name, "ax"
	.type \name, %function
	.globl \name
	.thumb_func
\name:
.endm
 | 
						|
 | 
						|
// Same as decl_func_x, except that the function is placed in a
// .scratch_y.\name section.
.macro decl_func_y name
	.section .scratch_y.\name, "ax"
	.type \name, %function
	.globl \name
	.thumb_func
\name:
.endm

// Functions declared with plain decl_func go to the scratch_x sections.
#define decl_func decl_func_x
 | 
						|
 | 
						|
// ----------------------------------------------------------------------------
 | 
						|
// Pixel-doubling encoders for RGB
 | 
						|
 | 
						|
// r0: Input buffer (word-aligned)
 | 
						|
// r1: Output buffer (word-aligned)
 | 
						|
// r2: Input size (pixels)
 | 
						|
 | 
						|
// Run one input word through the interpolator whose ACCUM0 address is in
// \interp:
//   - \pix (input word) is written to ACCUM0
//   - PEEK0 and PEEK1 are read back and used as load addresses
//   - on exit \pix holds the word at the PEEK0 address and \sym1 the word at
//     the PEEK1 address
// \pix, \sym1 and \interp must be three distinct low registers.
.macro do_channel_16bpp interp pix sym1
	str \pix, [\interp, #ACCUM0_OFFS]
	// Peek reads do not modify the interpolator, so both addresses can be
	// fetched before either table load.
	ldr \pix, [\interp, #PEEK0_OFFS]
	ldr \sym1, [\interp, #PEEK1_OFFS]
	ldr \pix, [\pix]
	ldr \sym1, [\sym1]
.endm
 | 
						|
 | 
						|
// Pixel-doubling encode loop: every input word is passed through
// do_channel_16bpp on INTERP0 and yields two output words.
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels). Should be a multiple of 4 * TMDS_ENCODE_UNROLL,
//     the number of pixels handled per pass through the unrolled loop.
//
// The loop runs while the output pointer is below the end pointer (unsigned
// compare), so a size that is not a whole number of unrolled blocks still
// terminates; in that case up to one block of extra output is written.
decl_func tmds_encode_loop_16bpp
	push {r4-r7, lr}
	// ip = end of output = r1 + 4 bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4, r6}
	do_channel_16bpp r2, r4, r5
	do_channel_16bpp r2, r6, r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
 | 
						|
 | 
						|
// Same as tmds_encode_loop_16bpp, but scale data to make up for lack of left
// shift in interpolator (costs 1 cycle per 2 pixels)
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels). Should be a multiple of 4 * TMDS_ENCODE_UNROLL,
//     the number of pixels handled per pass through the unrolled loop.
// r3: Left shift amount
//
// The loop runs while the output pointer is below the end pointer (unsigned
// compare), so a size that is not a whole number of unrolled blocks still
// terminates; in that case up to one block of extra output is written.
decl_func tmds_encode_loop_16bpp_leftshift
	push {r4-r7, lr}
	// ip = end of output = r1 + 4 bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4, r6}
	// Each input word is shifted left by r3 before it reaches the
	// interpolator.
	lsls r4, r3
	do_channel_16bpp r2, r4, r5
	lsls r6, r3
	do_channel_16bpp r2, r6, r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
 | 
						|
 | 
						|
// Pixel-doubling encode loop for input words that are fed to both
// interpolators: each input word is written to ACCUM0 of INTERP0 and INTERP1,
// and the PEEK0/PEEK1 results of both are used as load addresses for the four
// output words.
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels). Should be a multiple of 4 * TMDS_ENCODE_UNROLL,
//     the number of pixels handled per pass through the unrolled loop.
//
// The loop runs while the output pointer is below the end pointer (unsigned
// compare), so a size that is not a whole number of unrolled blocks still
// terminates; in that case up to one block of extra output is written.
decl_func tmds_encode_loop_8bpp
	push {r4-r7, lr}
	// ip = end of output = r1 + 4 bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	str r4, [r2, #ACCUM0_OFFS + INTERP1]
	str r4, [r2, #ACCUM0_OFFS]
	// Output words 0 and 1 come from INTERP0 lanes 0 and 1
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	// Output words 2 and 3 come from INTERP1 lanes 0 and 1
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
 | 
						|
 | 
						|
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
//
// The loop runs while the output pointer is below the end pointer (unsigned
// compare), so a size that is not a whole number of unrolled blocks still
// terminates; in that case up to one block of extra output is written.
//
// NOTE(review): the end pointer computed here is r1 + 8 * r2, whereas
// tmds_encode_loop_8bpp uses r1 + 4 * r2 with the same one-word-in,
// four-words-out loop body. Confirm what callers pass in r2 for this variant
// before changing the shift.
decl_func tmds_encode_loop_8bpp_leftshift
	push {r4-r7, lr}
	// ip = end of output = r1 + (r2 << 3)
	lsls r2, #3
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	// INTERP1 gets the unshifted word, INTERP0 the word shifted left by r3
	str r4, [r2, #ACCUM0_OFFS + INTERP1]
	lsls r4, r3
	str r4, [r2, #ACCUM0_OFFS]
	// Output words 0 and 1 come from INTERP0 lanes 0 and 1
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	// Output words 2 and 3 come from INTERP1 lanes 0 and 1
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
 | 
						|
 | 
						|
// ----------------------------------------------------------------------------
 | 
						|
// Fast 1bpp black/white encoder (full res)
 | 
						|
 | 
						|
// Taking the encoder from DVI spec, with initial balance 0:
 | 
						|
// 
 | 
						|
// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
 | 
						|
//   output symbol of 0x100 or 0x200
 | 
						|
// 
 | 
						|
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
 | 
						|
//  output symbol of 0x1ff or 0x2ff
 | 
						|
// 
 | 
						|
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
 | 
						|
// colour bit. If we process pixels in even-sized blocks, only the colour
 | 
						|
// lookup is needed.
 | 
						|
 | 
						|
// Encode 8 pixels @ 1bpp (using two table lookups)
//
// In:  r2 = input pixel word
//      r3 = lookup mask (preshifted)
//      r8 = pointer to encode table
//      r1 = output pointer (advanced by 4 words)
// Clobbers r4-r7 and flags.
//
// Each (shift instruction, shift amount) pair moves one 4-bit group of r2
// into the position selected by the mask in r3; the masked value is the byte
// offset of a two-word table entry.
// 2.125 cyc/pix
.macro tmds_encode_1bpp_body shift_a amt_a shift_b amt_b
	// The two lookups are independent, so their steps are interleaved.
	\shift_a r4, r2, #\amt_a
	\shift_b r6, r2, #\amt_b
	ands r4, r3
	ands r6, r3
	add r4, r8
	add r6, r8
	ldmia r4, {r4, r5}
	ldmia r6, {r6, r7}
	stmia r1!, {r4, r5, r6, r7}
.endm
 | 
						|
 | 
						|
// 1bpp encoder entry point. Each loop pass consumes one input word and
// emits 16 output words through four tmds_encode_1bpp_body expansions.
//
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
decl_func tmds_encode_1bpp
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved and cannot be pushed directly, so it goes via r4
	mov r4, r8
	push {r4}
	// Mask: 4 bit index, 8 bytes per entry
	movs r3, #0x78
	adr r4, tmds_1bpp_table
	mov r8, r4
	// ip = end of output = r1 + 2 bytes per output pixel
	lsls r2, r2, #1
	add r2, r1
	mov ip, r2
	b .Ltmds_encode_1bpp_check
.Ltmds_encode_1bpp_loop:
	ldmia r0!, {r2}
	// The shifts line each 4-bit group of r2 up with the mask in r3
	// (bits 6:3). The two build variants use the same shifts but swap
	// which group of each byte is looked up first.
#if DVI_1BPP_BIT_REVERSE
	tmds_encode_1bpp_body lsrs 1   lsls 3
	tmds_encode_1bpp_body lsrs 9   lsrs 5
	tmds_encode_1bpp_body lsrs 17  lsrs 13
	tmds_encode_1bpp_body lsrs 25  lsrs 21
#else
	tmds_encode_1bpp_body lsls 3  lsrs 1
	tmds_encode_1bpp_body lsrs 5  lsrs 9
	tmds_encode_1bpp_body lsrs 13 lsrs 17
	tmds_encode_1bpp_body lsrs 21 lsrs 25
#endif
.Ltmds_encode_1bpp_check:
	cmp r1, ip
	blo .Ltmds_encode_1bpp_loop

	// Restore r8, then return
	pop {r4}
	mov r8, r4
	pop {r4, r5, r6, r7, pc}
 | 
						|
 | 
						|
// Lookup table for tmds_encode_1bpp: 16 entries of two words, indexed by a
// 4-bit group of input pixels. Each word packs two 10-bit symbols: one in
// bits [9:0] (0x100 or 0x200) and one in bits [19:10] (0x1ff or 0x2ff).
//
// TMDS_1BPP_Pxy: x selects the [9:0] symbol (0 -> 0x100, 1 -> 0x200) and
//                y selects the [19:10] symbol (0 -> 0x1ff, 1 -> 0x2ff).
#define TMDS_1BPP_PAIR(lo, hi) (((hi) << 10) | (lo))
#define TMDS_1BPP_P00 TMDS_1BPP_PAIR(0x100, 0x1ff) /* 0x7fd00 */
#define TMDS_1BPP_P10 TMDS_1BPP_PAIR(0x200, 0x1ff) /* 0x7fe00 */
#define TMDS_1BPP_P01 TMDS_1BPP_PAIR(0x100, 0x2ff) /* 0xbfd00 */
#define TMDS_1BPP_P11 TMDS_1BPP_PAIR(0x200, 0x2ff) /* 0xbfe00 */

.align 2
tmds_1bpp_table:
#if DVI_1BPP_BIT_REVERSE
	// Entries are listed in table order; the comment on each line is the
	// bit-reversed form of its index.
	.word TMDS_1BPP_P00, TMDS_1BPP_P00  // 0000
	.word TMDS_1BPP_P00, TMDS_1BPP_P01  // 1000
	.word TMDS_1BPP_P00, TMDS_1BPP_P10  // 0100
	.word TMDS_1BPP_P00, TMDS_1BPP_P11  // 1100
	.word TMDS_1BPP_P01, TMDS_1BPP_P00  // 0010
	.word TMDS_1BPP_P01, TMDS_1BPP_P01  // 1010
	.word TMDS_1BPP_P01, TMDS_1BPP_P10  // 0110
	.word TMDS_1BPP_P01, TMDS_1BPP_P11  // 1110
	.word TMDS_1BPP_P10, TMDS_1BPP_P00  // 0001
	.word TMDS_1BPP_P10, TMDS_1BPP_P01  // 1001
	.word TMDS_1BPP_P10, TMDS_1BPP_P10  // 0101
	.word TMDS_1BPP_P10, TMDS_1BPP_P11  // 1101
	.word TMDS_1BPP_P11, TMDS_1BPP_P00  // 0011
	.word TMDS_1BPP_P11, TMDS_1BPP_P01  // 1011
	.word TMDS_1BPP_P11, TMDS_1BPP_P10  // 0111
	.word TMDS_1BPP_P11, TMDS_1BPP_P11  // 1111
#else
	// The comment on each line is the entry's index in binary.
	.word TMDS_1BPP_P00, TMDS_1BPP_P00  // 0000
	.word TMDS_1BPP_P10, TMDS_1BPP_P00  // 0001
	.word TMDS_1BPP_P01, TMDS_1BPP_P00  // 0010
	.word TMDS_1BPP_P11, TMDS_1BPP_P00  // 0011
	.word TMDS_1BPP_P00, TMDS_1BPP_P10  // 0100
	.word TMDS_1BPP_P10, TMDS_1BPP_P10  // 0101
	.word TMDS_1BPP_P01, TMDS_1BPP_P10  // 0110
	.word TMDS_1BPP_P11, TMDS_1BPP_P10  // 0111
	.word TMDS_1BPP_P00, TMDS_1BPP_P01  // 1000
	.word TMDS_1BPP_P10, TMDS_1BPP_P01  // 1001
	.word TMDS_1BPP_P01, TMDS_1BPP_P01  // 1010
	.word TMDS_1BPP_P11, TMDS_1BPP_P01  // 1011
	.word TMDS_1BPP_P00, TMDS_1BPP_P11  // 1100
	.word TMDS_1BPP_P10, TMDS_1BPP_P11  // 1101
	.word TMDS_1BPP_P01, TMDS_1BPP_P11  // 1110
	.word TMDS_1BPP_P11, TMDS_1BPP_P11  // 1111
#endif
 | 
						|
 | 
						|
 | 
						|
// ----------------------------------------------------------------------------
 | 
						|
// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
 | 
						|
 | 
						|
// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
 | 
						|
// pixels with +4, so that we can mix-and-match our even/odd codewords and
 | 
						|
// always get a properly balanced sequence:
 | 
						|
//
 | 
						|
// level 0: (05 -> 103), then (04 -> 1fc)  (decimal 5, 4)
 | 
						|
// level 1: (50 -> 130), then (51 -> 1cf)  (decimal 80, 81)
 | 
						|
// level 2: (af -> 230), then (ae -> 2cf)  (decimal 175, 174)
 | 
						|
// level 3: (fa -> 203), then (fb -> 2fc)  (decimal 250, 251)
 | 
						|
//
 | 
						|
// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
 | 
						|
//
 | 
						|
// Alternatively we could use symbols with 0 balance, which results in lower
 | 
						|
// contrast but avoids the LSB bobble:
 | 
						|
//
 | 
						|
// level 0: (10 -> 1f0) always
 | 
						|
// level 1: (5a -> 263) always
 | 
						|
// level 2: (a5 -> 163) always
 | 
						|
// level 3: (ef -> 2f0) always
 | 
						|
 | 
						|
// One table lookup for tmds_encode_2bpp.
//
// In:  r0 = table base pointer
//      r2 = input pixels
//      r3 = mask applied after the shift
// Out: \dest = table word at byte offset ((r2 shifted by #\amount) & r3)
// Clobbers flags.
.macro encode_2bpp_body shift_op amount dest
	\shift_op \dest, r2, #\amount
	ands \dest, \dest, r3
	ldr \dest, [r0, \dest]
.endm
 | 
						|
 | 
						|
// 2bpp encoder entry point. Each loop pass consumes one input word and
// emits 8 output words, one table lookup per 4-bit group of the input.
//
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
decl_func tmds_encode_2bpp
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved and cannot be pushed directly, so it goes via r4
	mov r4, r8
	push {r4}
	// Limit pointer: 1 word per 2 pixels
	lsls r2, r2, #1
	add r2, r1
	mov ip, r2
	// Mask: 4-bit index into 4-byte entries.
	movs r3, #0x3c
	// The input pointer lives in r8 so that r0 can hold the table base, as
	// encode_2bpp_body expects.
	mov r8, r0
	adr r0, tmds_2bpp_table
	b .Ltmds_encode_2bpp_check
.Ltmds_encode_2bpp_loop:
	// Fetch the next input word through r4, since ldmia needs a low
	// register as its base.
	mov r4, r8
	ldmia r4!, {r2}
	mov r8, r4
	// Low half of the input word
	encode_2bpp_body lsls 2  r4
	encode_2bpp_body lsrs 2  r5
	encode_2bpp_body lsrs 6  r6
	encode_2bpp_body lsrs 10 r7
	stmia r1!, {r4, r5, r6, r7}
	// High half of the input word
	encode_2bpp_body lsrs 14 r4
	encode_2bpp_body lsrs 18 r5
	encode_2bpp_body lsrs 22 r6
	encode_2bpp_body lsrs 26 r7
	stmia r1!, {r4, r5, r6, r7}
.Ltmds_encode_2bpp_check:
	cmp r1, ip
	blo .Ltmds_encode_2bpp_loop
	// Restore r8, then return
	pop {r4}
	mov r8, r4
	pop {r4, r5, r6, r7, pc}
 | 
						|
 | 
						|
// Lookup table for tmds_encode_2bpp: 16 one-word entries. Each word packs
// two 10-bit symbols. Index bits [1:0] choose the symbol in word bits [9:0]
// (0x103, 0x130, 0x230, 0x203 for levels 0..3) and index bits [3:2] choose
// the symbol in word bits [19:10] (0x1fc, 0x1cf, 0x2cf, 0x2fc).
#define TMDS_2BPP_PAIR(lo, hi) (((hi) << 10) | (lo))

.align 2
tmds_2bpp_table:
	.word TMDS_2BPP_PAIR(0x103, 0x1fc) // 00, 00
	.word TMDS_2BPP_PAIR(0x130, 0x1fc) // 01, 00
	.word TMDS_2BPP_PAIR(0x230, 0x1fc) // 10, 00
	.word TMDS_2BPP_PAIR(0x203, 0x1fc) // 11, 00
	.word TMDS_2BPP_PAIR(0x103, 0x1cf) // 00, 01
	.word TMDS_2BPP_PAIR(0x130, 0x1cf) // 01, 01
	.word TMDS_2BPP_PAIR(0x230, 0x1cf) // 10, 01
	.word TMDS_2BPP_PAIR(0x203, 0x1cf) // 11, 01
	.word TMDS_2BPP_PAIR(0x103, 0x2cf) // 00, 10
	.word TMDS_2BPP_PAIR(0x130, 0x2cf) // 01, 10
	.word TMDS_2BPP_PAIR(0x230, 0x2cf) // 10, 10
	.word TMDS_2BPP_PAIR(0x203, 0x2cf) // 11, 10
	.word TMDS_2BPP_PAIR(0x103, 0x2fc) // 00, 11
	.word TMDS_2BPP_PAIR(0x130, 0x2fc) // 01, 11
	.word TMDS_2BPP_PAIR(0x230, 0x2fc) // 10, 11
	.word TMDS_2BPP_PAIR(0x203, 0x2fc) // 11, 11
 | 
						|
 | 
						|
// ----------------------------------------------------------------------------
 | 
						|
// Full-resolution RGB encode (not very practical)
 | 
						|
 | 
						|
// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
 | 
						|
// taking horizontal blanking (at VGA) and dual core into account, and
 | 
						|
// assuming the 3 channels are encoded individually.)
 | 
						|
//
 | 
						|
// Here is an idea
 | 
						|
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
 | 
						|
// ACCUM0), concatenated with the sign bit of our running disparity (from
 | 
						|
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
 | 
						|
// with the symbol's disparity stored left-justified in the upper 12 bits, as
 | 
						|
// e.g. a 6 bit signed integer.
 | 
						|
//
 | 
						|
// - Load pixel data.                        cyc: 0.75 (ldmia 2 words, every 4 pixels)
 | 
						|
// - Write pixel to ACCUM0.                  cyc: 1
 | 
						|
// - Read address from PEEK2.                cyc: 1
 | 
						|
// - Load encoded pixel from address.        cyc: 2
 | 
						|
// - Write disparity data to ACCUM1_ADD      cyc: 1
 | 
						|
// - Write encoded data to output buffer.    cyc: 1.25 (stmia 4 words, every 4 pixels)
 | 
						|
//
 | 
						|
// With decent register allocation we may be able to load 4 pixels at
 | 
						|
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
 | 
						|
//
 | 
						|
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
 | 
						|
// overflow and affect the running disparity, but with 16 zeroes in between,
 | 
						|
// this would take much longer than one scanline, so everything is fine if
 | 
						|
// we clear the accumulator at the start of the scanline.
 | 
						|
//
 | 
						|
// Note that we need to use two interpolators to get the bits from both pixels
 | 
						|
// -- we are not outputting a single DC-balanced stream, but rather two
 | 
						|
// interleaved streams which are each DC-balanced. This is fine electrically,
 | 
						|
// but our output here will *NOT* match the TMDS encoder given in the DVI
 | 
						|
// spec.
 | 
						|
 | 
						|
// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
 | 
						|
// feedback. With the feedback enabled (default), the output is DC balanced,
 | 
						|
// but there are just barely enough CPU cycles to do all the encode, so it's
 | 
						|
// essentially a party trick. If you disable DC balancing, the performance is
 | 
						|
// much better, and many monitors will still accept the signals as long as you
 | 
						|
// DC couple your DVI signals.
 | 
						|
 | 
						|
// Encode one input word into two output words using both interpolators.
//
// In:  \w0 = input word
//      r2  = interpolator base (address of INTERP0 ACCUM0)
// Out: \w0 = word loaded from the address given by INTERP0's full result
//      \w1 = word loaded from the address given by INTERP1's full result
//
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each loaded word is also written
// to ACCUM1_ADD of the interpolator it came from.
.macro tmds_fullres_encode_loop_body w0 w1
	// The same input word goes to ACCUM0 of both interpolators
	str \w0, [r2, #ACCUM0_OFFS + INTERP1]
	str \w0, [r2, #ACCUM0_OFFS]

	// INTERP0 lookup
	ldr \w0, [r2, #PEEK2_OFFS]
	ldr \w0, [\w0]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \w0, [r2, #ACCUM1_ADD_OFFS]
#endif

	// INTERP1 lookup
	ldr \w1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \w1, [\w1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \w1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
 | 
						|
 | 
						|
// Function body for the full-resolution 16bpp encode loop. It is expanded
// once per entry point further down.
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count. Should be a multiple of 64, the number of pixels handled
//     per pass through the unrolled loop.
//
// The loop exits as soon as the output pointer reaches or passes the end
// pointer (unsigned compare), so a count that is not a multiple of 64 still
// terminates; in that case up to one block of extra output is written.
.macro tmds_fullres_encode_loop_16bpp
	push {r4-r7, lr}
	// r8 is callee-saved and cannot be pushed directly, so it goes via r4
	mov r4, r8
	push {r4}

	// ip = end of output = r1 + 4 bytes per pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if no feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 16 // 64 pixels per iteration
	ldmia r0!, {r4, r6}
	tmds_fullres_encode_loop_body r4 r5
	tmds_fullres_encode_loop_body r6 r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bhs 1f
	bx r8
1:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
	tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
	tmds_fullres_encode_loop_16bpp
 | 
						|
 | 
						|
 | 
						|
// Encode one input word into two output words using both interpolators,
// shifting the word left by r3 before it is written to INTERP0.
//
// In:  \w0 = input word
//      r2  = interpolator base (address of INTERP0 ACCUM0)
//      r3  = left shift amount
// Out: \w0 = word loaded from the address given by INTERP0's full result
//      \w1 = word loaded from the address given by INTERP1's full result
// Clobbers flags.
//
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each loaded word is also written
// to ACCUM1_ADD of the interpolator it came from.
.macro tmds_fullres_encode_loop_body_leftshift w0 w1
	// Note we apply the leftshift for INTERP0 only: INTERP1 is written
	// first, with the unshifted word.
	str \w0, [r2, #ACCUM0_OFFS + INTERP1]
	lsls \w0, \w0, r3
	str \w0, [r2, #ACCUM0_OFFS]

	// INTERP0 lookup
	ldr \w0, [r2, #PEEK2_OFFS]
	ldr \w0, [\w0]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \w0, [r2, #ACCUM1_ADD_OFFS]
#endif

	// INTERP1 lookup
	ldr \w1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \w1, [\w1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \w1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
 | 
						|
 | 
						|
// Function body for the full-resolution 16bpp encode loop with a left shift
// applied to the data written to INTERP0. It is expanded once per entry
// point further down.
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count. Should be a multiple of 64, the number of pixels handled
//     per pass through the unrolled loop.
// r3: Left shift amount
//
// The loop exits as soon as the output pointer reaches or passes the end
// pointer (unsigned compare), so a count that is not a multiple of 64 still
// terminates; in that case up to one block of extra output is written.
//
// Only r8 is saved besides r4-r7: no other high register is written here.
.macro tmds_fullres_encode_loop_16bpp_leftshift
	push {r4-r7, lr}
	// r8 is callee-saved and cannot be pushed directly, so it goes via r4
	mov r4, r8
	push {r4}

	// ip = end of output = r1 + 4 bytes per pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 16 // 64 pixels per iteration
	ldmia r0!, {r4, r6}
	tmds_fullres_encode_loop_body_leftshift r4 r5
	tmds_fullres_encode_loop_body_leftshift r6 r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bhs 1f
	bx r8
1:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

// One copy in each of the scratch_x and scratch_y sections
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
	tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
	tmds_fullres_encode_loop_16bpp_leftshift
 | 
						|
 | 
						|
 | 
						|
// ----------------------------------------------------------------------------
 | 
						|
// Full-resolution 8bpp paletted encode
 | 
						|
 | 
						|
// Variant of tmds_fullres_encode_loop_16bpp that reads
 | 
						|
// 8-bit wide pixels packed 4 per word.  The interpolator
 | 
						|
// base is set to a reordered list of TMDS symbols based
 | 
						|
// on a user colour palette.
 | 
						|
 | 
						|
// Encode two paletted pixels into one output word.
//
// In:  \px = two pixels in bits [17:2]
//      r2  = interp base pointer
// Out: \px = two symbols in bits [19:0]: the INTERP0 lookup result ORed with
//            the INTERP1 lookup result shifted left by 10
// Clobbers r7 (temporary) and flags.
//
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each looked-up word is also
// written to ACCUM1_ADD of the interpolator it came from.
.macro tmds_palette_encode_loop_body px
	// The same pixel pair goes to ACCUM0 of both interpolators
	str \px, [r2, #ACCUM0_OFFS]
	str \px, [r2, #ACCUM0_OFFS + INTERP1]

	// INTERP0 lookup -> \px
	ldr \px, [r2, #PEEK2_OFFS]
	ldr \px, [\px]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \px, [r2, #ACCUM1_ADD_OFFS]
#endif

	// INTERP1 lookup -> r7
	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
	ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif

	// Merge: the INTERP1 result goes above the low 10 bits of the
	// INTERP0 result
	lsls r7, r7, #10
	orrs \px, \px, r7
.endm
 | 
						|
 | 
						|
// Function body for the full-resolution 8bpp paletted encode loop. It is
// expanded once per entry point further down.
//
// r0: Input buffer (word-aligned), 4 pixels per word
// r1: Output buffer (word-aligned), 2 symbols per word
// r2: Pixel count. Should be a multiple of 80, the number of pixels handled
//     per pass through the unrolled loop.
//
// The loop exits as soon as the output pointer reaches or passes the end
// pointer (unsigned compare), so a count that is not a multiple of 80 still
// terminates; in that case up to one block of extra output is written.
.macro tmds_palette_encode_loop
	push {r4-r7, lr}
	// r8 is callee-saved and cannot be pushed directly, so it goes via r4
	mov r4, r8
	push {r4}

	// ip = end of output = r1 + 2 bytes per pixel
	lsls r2, #1
	add r2, r1
	mov ip, r2
	// r2 is reused as the interpolator base (address of INTERP0 ACCUM0)
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 10 // 80 pixels per iteration
	ldmia r0!, {r3, r5}
	// Split each input word into two registers that each hold a pair of
	// pixels in bits [17:2], as tmds_palette_encode_loop_body expects.
	lsrs r4, r3, #14
	lsls r3, #2
	lsrs r6, r5, #14
	lsls r5, #2
	tmds_palette_encode_loop_body r3
	tmds_palette_encode_loop_body r4
	tmds_palette_encode_loop_body r5
	tmds_palette_encode_loop_body r6
	stmia r1!, {r3, r4, r5, r6}
.endr
2:
	cmp r1, ip
	bhs 1f
	bx r8
1:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

// One copy in each of the scratch_x and scratch_y sections
decl_func_x tmds_palette_encode_loop_x
	tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
	tmds_palette_encode_loop
 |