// PicoDVI/software/libdvi/tmds_encode.S
//
// 624 lines
// 16 KiB
// ArmAsm

#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "dvi_config_defs.h"
// Interpolator register offsets, all measured from the INTERP0 ACCUM0
// register. The encode loops in this file load that address into a base
// register and reach every other interpolator register with an immediate
// offset from it.
// Offsets suitable for ldr/str (must be <= 0x7c):
// ACCUM0_OFFS is zero by construction, since ACCUM0 is the base itself.
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// PEEK2_OFFS addresses the PEEK_FULL register; the name simply continues the
// PEEK0/PEEK1 numbering used for the two lanes.
#define PEEK2_OFFS (SIO_INTERP0_PEEK_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Distance from INTERP0 ACCUM0 to INTERP1 ACCUM0. This file adds it to any of
// the offsets above to reach the corresponding INTERP1 register from the same
// INTERP0 base pointer.
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)
.syntax unified
.cpu cortex-m0plus
.thumb
// Function declaration helpers. Each one opens a dedicated executable
// section named after the function, exports the symbol, marks it as a Thumb
// function and emits the entry label. The function body is whatever follows
// the macro invocation.
//
// decl_func_x places the function in a .scratch_x.<name> section.
.macro decl_func_x name
.section .scratch_x.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// decl_func_y is identical except that the section is .scratch_y.<name>.
.macro decl_func_y name
.section .scratch_y.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// Plain decl_func is an alias for the scratch_x variant.
#define decl_func decl_func_x
// ----------------------------------------------------------------------------
// Pixel-doubling encoders for RGB
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// Push one input word through interpolator 0 and fetch two result words.
//
// r_ibase:  register holding the address of INTERP0 ACCUM0
// r_inout0: in:  pixel data word, written to ACCUM0
//           out: word loaded from the address read back from PEEK lane 0
// r_out1:   out: word loaded from the address read back from PEEK lane 1
//
// Both PEEK results are used as pointers and dereferenced, so the
// interpolator is presumably set up by the caller to turn the accumulator
// contents into lookup-table addresses -- that configuration is not part of
// this file.
.macro do_channel_16bpp r_ibase r_inout0 r_out1
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
ldr \r_inout0, [\r_inout0]
ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
ldr \r_out1, [\r_out1]
.endm
// Pixel-doubled TMDS encode of one colour channel, 16bpp input.
//
// In:  r0 = input pixel buffer (word-aligned)
//      r1 = output symbol buffer (word-aligned)
//      r2 = number of input pixels
//
// Each pass of the unrolled body loads two input words, runs each of them
// through do_channel_16bpp, and stores four output words. The output is
// therefore 4 * r2 bytes long. The loop stops only when the output pointer
// lands exactly on that limit.
decl_func tmds_encode_loop_16bpp
    push {r4-r7, lr}
    lsls r2, r2, #2                 // r2 = output length in bytes
    add r2, r1                      // r2 = one-past-the-end output pointer
    mov r12, r2                     // park the limit in r12 (ip)
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    b .Ltmds_encode_loop_16bpp_test
.p2align 2
.Ltmds_encode_loop_16bpp_body:
.rept TMDS_ENCODE_UNROLL
    ldmia r0!, {r4, r6}
    do_channel_16bpp r2, r4, r5
    do_channel_16bpp r2, r6, r7
    stmia r1!, {r4-r7}
.endr
.Ltmds_encode_loop_16bpp_test:
    cmp r1, r12
    bne .Ltmds_encode_loop_16bpp_body
    pop {r4-r7, pc}
// Variant of tmds_encode_loop_16bpp that scales the data in software to make
// up for the interpolator having no left shift: every input word is shifted
// left by r3 before being handed over. Costs 1 extra cycle per 2 pixels.
//
// In:  r0 = input pixel buffer (word-aligned)
//      r1 = output symbol buffer (word-aligned)
//      r2 = number of input pixels
//      r3 = left shift amount, applied to each whole input word
//
// Output length is 4 * r2 bytes. The loop stops only when the output pointer
// equals that limit exactly.
decl_func tmds_encode_loop_16bpp_leftshift
    push {r4-r7, lr}
    lsls r2, r2, #2                 // r2 = output length in bytes
    add r2, r1                      // r2 = one-past-the-end output pointer
    mov r12, r2                     // park the limit in r12 (ip)
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    b .Ltmds_encode_loop_16bpp_leftshift_test
.p2align 2
.Ltmds_encode_loop_16bpp_leftshift_body:
.rept TMDS_ENCODE_UNROLL
    ldmia r0!, {r4, r6}
    lsls r4, r4, r3
    do_channel_16bpp r2, r4, r5
    lsls r6, r6, r3
    do_channel_16bpp r2, r6, r7
    stmia r1!, {r4-r7}
.endr
.Ltmds_encode_loop_16bpp_leftshift_test:
    cmp r1, r12
    bne .Ltmds_encode_loop_16bpp_leftshift_body
    pop {r4-r7, pc}
// Pixel-doubled TMDS encode of one colour channel, 8bpp input.
//
// In:  r0 = input pixel buffer (word-aligned)
//      r1 = output symbol buffer (word-aligned)
//      r2 = number of input pixels
//
// The same input word is written to ACCUM0 of both interpolators. Two lookups
// are then taken from interpolator 0 (PEEK0, PEEK1) and two from
// interpolator 1, giving four output words per input word. Output length is
// 4 * r2 bytes, and the loop stops only when the output pointer equals that
// limit exactly.
decl_func tmds_encode_loop_8bpp
    push {r4-r7, lr}
    lsls r2, r2, #2                 // r2 = output length in bytes
    add r2, r1                      // r2 = one-past-the-end output pointer
    mov r12, r2                     // park the limit in r12 (ip)
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    b .Ltmds_encode_loop_8bpp_test
.p2align 2
.Ltmds_encode_loop_8bpp_body:
.rept TMDS_ENCODE_UNROLL
    ldmia r0!, {r4}
    // Feed both interpolators with the same word
    str r4, [r2, #ACCUM0_OFFS + INTERP1]
    str r4, [r2, #ACCUM0_OFFS]
    // Interpolator 0 results
    ldr r4, [r2, #PEEK0_OFFS]
    ldr r4, [r4]
    ldr r5, [r2, #PEEK1_OFFS]
    ldr r5, [r5]
    // Interpolator 1 results
    ldr r6, [r2, #PEEK0_OFFS + INTERP1]
    ldr r6, [r6]
    ldr r7, [r2, #PEEK1_OFFS + INTERP1]
    ldr r7, [r7]
    stmia r1!, {r4-r7}
.endr
.Ltmds_encode_loop_8bpp_test:
    cmp r1, r12
    bne .Ltmds_encode_loop_8bpp_body
    pop {r4-r7, pc}
// Pixel-doubled TMDS encode of one colour channel, 8bpp input, with a
// software left shift applied to the data given to interpolator 0.
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
//
// Each pass of the unrolled body reads one input word (four 8-bit pixels) and
// writes four output words, i.e. 4 output bytes per input pixel -- the same
// ratio as tmds_encode_loop_8bpp. The end-of-output limit is therefore
// r1 + (r2 << 2). The loop exits only when r1 equals that limit exactly.
decl_func tmds_encode_loop_8bpp_leftshift
    push {r4, r5, r6, r7, lr}
    // Output length in bytes = 4 * input pixels. (A shift of 3 here would
    // make the loop run twice as far as requested, overrunning both the input
    // and the output buffer.)
    lsls r2, #2
    add r2, r1
    mov ip, r2
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
    ldmia r0!, {r4}
    // interp1 gets the unshifted word, interp0 the shifted one
    str r4, [r2, #ACCUM0_OFFS + INTERP1]
    lsls r4, r3
    str r4, [r2, #ACCUM0_OFFS]
    ldr r4, [r2, #PEEK0_OFFS]
    ldr r4, [r4]
    ldr r5, [r2, #PEEK1_OFFS]
    ldr r5, [r5]
    ldr r6, [r2, #PEEK0_OFFS + INTERP1]
    ldr r6, [r6]
    ldr r7, [r2, #PEEK1_OFFS + INTERP1]
    ldr r7, [r7]
    stmia r1!, {r4, r5, r6, r7}
.endr
2:
    cmp r1, ip
    bne 1b
    pop {r4, r5, r6, r7, pc}
// ----------------------------------------------------------------------------
// Fast 1bpp black/white encoder (full res)
// Taking the encoder from DVI spec, with initial balance 0:
//
// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
// output symbol of 0x100 or 0x200
//
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
// output symbol of 0x1ff or 0x2ff
//
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
// colour bit. If we process pixels in even-sized blocks, only the colour
// lookup is needed.
// Encode 8 pixels @ 1bpp (using two table lookups)
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 2.125 cyc/pix
// Macro arguments are two (shift instruction, shift amount) pairs. Each pair
// positions one 4-bit group of the input word so that, after masking with r3,
// it becomes a byte offset into the encode table. Each table lookup loads two
// words.
//
// In:       r1 = output pointer
//           r2 = input pixel word (not modified)
//           r3 = lookup mask (tmds_encode_1bpp passes 0x78)
//           r8 = table base address
// Out:      four words stored at r1; r1 advanced past them
// Clobbers: r4-r7, flags
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
// First group: two words into r4, r5
\shift_instr0 r4, r2, #\shamt0
ands r4, r3
add r4, r8
ldmia r4, {r4, r5}
// Second group: two words into r6, r7
\shift_instr1 r6, r2, #\shamt1
ands r6, r3
add r6, r8
ldmia r6, {r6, r7}
stmia r1!, {r4, r5, r6, r7}
.endm
// Full-resolution 1bpp (black/white) encode.
//
// In:  r0 = input buffer (word-aligned), 32 pixels per word
//      r1 = output buffer (word-aligned)
//      r2 = output pixel count
//
// Register roles inside the loop:
//      r2  = current input word
//      r3  = lookup mask
//      r8  = address of tmds_1bpp_table (saved and restored around the loop)
//      r12 = end-of-output pointer
//
// One pass of the loop turns one input word into 16 output words.
decl_func tmds_encode_1bpp
    push {r4, r5, r6, r7, lr}
    mov r7, r8
    push {r7}
    // Two pixels per output word: byte length = 2 * pixel count
    lsls r2, r2, #1
    add r2, r1
    mov r12, r2
    adr r4, tmds_1bpp_table
    mov r8, r4
    // 0x78 selects a 4-bit table index, pre-scaled by the 8-byte entry size
    movs r3, #0x78
    b .Ltmds_encode_1bpp_test
.Ltmds_encode_1bpp_body:
    ldmia r0!, {r2}
#if !DVI_1BPP_BIT_REVERSE
    tmds_encode_1bpp_body lsls, 3, lsrs, 1
    tmds_encode_1bpp_body lsrs, 5, lsrs, 9
    tmds_encode_1bpp_body lsrs, 13, lsrs, 17
    tmds_encode_1bpp_body lsrs, 21, lsrs, 25
#else
    // Same shifts, but the two groups of each pair are taken in swapped order
    tmds_encode_1bpp_body lsrs, 1, lsls, 3
    tmds_encode_1bpp_body lsrs, 9, lsrs, 5
    tmds_encode_1bpp_body lsrs, 17, lsrs, 13
    tmds_encode_1bpp_body lsrs, 25, lsrs, 21
#endif
.Ltmds_encode_1bpp_test:
    cmp r1, r12
    blo .Ltmds_encode_1bpp_body
    pop {r7}
    mov r8, r7
    pop {r4, r5, r6, r7, pc}
// Lookup table for tmds_encode_1bpp: 16 entries of two words, one entry per
// 4-pixel group. Each word packs two 10-bit symbols: bits 9:0 hold the
// even-position pixel (0x100 or 0x200) and bits 19:10 hold the odd-position
// pixel (0x1ff or 0x2ff). For example 0x7fd00 = (0x1ff << 10) | 0x100.
//
// The 4-digit comment on each row is the pixel pattern for that entry, with
// the first output pixel as the rightmost digit. In the DVI_1BPP_BIT_REVERSE
// table the rows are the same entries, ordered so that the index bits are
// read in the opposite direction.
//
// Word-aligned because the function above takes the table address with adr.
.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fe00, 0x7fd00 // 0001
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfe00, 0x7fd00 // 0011
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fe00, 0x7fe00 // 0101
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfe00, 0x7fe00 // 0111
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fe00, 0xbfd00 // 1001
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfe00, 0xbfd00 // 1011
.word 0x7fd00, 0xbfe00 // 1100
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfd00, 0xbfe00 // 1110
.word 0xbfe00, 0xbfe00 // 1111
#else
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fd00, 0xbfe00 // 1100
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfd00, 0xbfe00 // 1110
.word 0x7fe00, 0x7fd00 // 0001
.word 0x7fe00, 0xbfd00 // 1001
.word 0x7fe00, 0x7fe00 // 0101
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfe00, 0x7fd00 // 0011
.word 0xbfe00, 0xbfd00 // 1011
.word 0xbfe00, 0x7fe00 // 0111
.word 0xbfe00, 0xbfe00 // 1111
#endif
// ----------------------------------------------------------------------------
// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
// pixels with +4, so that we can mix-and-match our even/odd codewords and
// always get a properly balanced sequence:
//
// level 0: (05 -> 103), then (04 -> 1fc) (decimal 5, 4)
// level 1: (50 -> 130), then (51 -> 1cf) (decimal 80, 81)
// level 2: (af -> 230), then (ae -> 2cf) (decimal 175, 174)
// level 3: (fa -> 203), then (fb -> 2fc) (decimal 250, 251)
//
// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
//
// Alternatively we could use symbols with 0 balance, which results in lower
// contrast but avoids the LSB bobble:
//
// level 0: (10 -> 1f0) always
// level 1: (5a -> 263) always
// level 2: (a5 -> 163) always
// level 3: (ef -> 2f0) always
// Table base pointer in r0. Input pixels in r2.
// Look up one output word (two pixels) for tmds_encode_2bpp.
//
// shift_instr, shamt: shift that moves the wanted 4-bit group of r2 into the
//                     bit positions selected by the mask in r3
// rd:                 destination register; receives the table word
//
// In:       r0 = table base, r2 = input pixel word (not modified), r3 = mask
// Clobbers: rd, flags
.macro encode_2bpp_body shift_instr shamt rd
\shift_instr \rd, r2, #\shamt
ands \rd, r3
ldr \rd, [r0, \rd]
.endm
// Full-resolution 2bpp encode.
//
// In:  r0 = input buffer (word-aligned), 16 pixels per word
//      r1 = output buffer (word-aligned)
//      r2 = output pixel count
//
// Register roles inside the loop:
//      r0  = address of tmds_2bpp_table
//      r2  = current input word
//      r3  = lookup mask
//      r8  = input read pointer (saved and restored around the loop)
//      r12 = end-of-output pointer
//
// One pass of the loop turns one input word into 8 output words.
decl_func tmds_encode_2bpp
    push {r4, r5, r6, r7, lr}
    mov r7, r8
    push {r7}
    // r0 is needed as the table base, so the input pointer moves to r8
    mov r8, r0
    adr r0, tmds_2bpp_table
    // 0x3c selects a 4-bit table index, pre-scaled by the 4-byte entry size
    movs r3, #0x3c
    // Two pixels per output word: byte length = 2 * pixel count
    lsls r2, r2, #1
    add r2, r1
    mov r12, r2
    b .Ltmds_encode_2bpp_test
.Ltmds_encode_2bpp_body:
    // Fetch the next input word via r4, since ldmia needs a low base register
    mov r4, r8
    ldmia r4!, {r2}
    mov r8, r4
    // Pixels 0-7
    encode_2bpp_body lsls, 2, r4
    encode_2bpp_body lsrs, 2, r5
    encode_2bpp_body lsrs, 6, r6
    encode_2bpp_body lsrs, 10, r7
    stmia r1!, {r4, r5, r6, r7}
    // Pixels 8-15
    encode_2bpp_body lsrs, 14, r4
    encode_2bpp_body lsrs, 18, r5
    encode_2bpp_body lsrs, 22, r6
    encode_2bpp_body lsrs, 26, r7
    stmia r1!, {r4, r5, r6, r7}
.Ltmds_encode_2bpp_test:
    cmp r1, r12
    blo .Ltmds_encode_2bpp_body
    pop {r7}
    mov r8, r7
    pop {r4, r5, r6, r7, pc}
// Lookup table for tmds_encode_2bpp: 16 one-word entries, one per pair of
// 2-bit pixels. Bits 9:0 of each word hold the even-position symbol and bits
// 19:10 the odd-position symbol, e.g. 0x7f103 = (0x1fc << 10) | 0x103.
//
// Row comments give "even pixel level, odd pixel level" in binary. The even
// pixel level forms the low two bits of the table index.
//
// Word-aligned because the function above takes the table address with adr.
.align 2
tmds_2bpp_table:
.word 0x7f103 // 00, 00
.word 0x7f130 // 01, 00
.word 0x7f230 // 10, 00
.word 0x7f203 // 11, 00
.word 0x73d03 // 00, 01
.word 0x73d30 // 01, 01
.word 0x73e30 // 10, 01
.word 0x73e03 // 11, 01
.word 0xb3d03 // 00, 10
.word 0xb3d30 // 01, 10
.word 0xb3e30 // 10, 10
.word 0xb3e03 // 11, 10
.word 0xbf103 // 00, 11
.word 0xbf130 // 01, 11
.word 0xbf230 // 10, 11
.word 0xbf203 // 11, 11
// ----------------------------------------------------------------------------
// Full-resolution RGB encode (not very practical)
// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
// taking horizontal blanking (at VGA) and dual core into account, and
// assuming the 3 channels are encoded individually.)
//
// Here is an idea
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
// ACCUM0), concatenated with the sign bit of our running disparity (from
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
// with the symbol's disparity stored left-justified in the upper 12 bits, as
// e.g. a 6 bit signed integer.
//
// - Load pixel data. cyc: 0.75 (ldmia 2 words, every 4 pixels)
// - Write pixel to ACCUM0. cyc: 1
// - Read address from PEEK2. cyc: 1
// - Load encoded pixel from address. cyc: 2
// - Write disparity data to ACCUM1_ADD cyc: 1
// - Write encoded data to output buffer. cyc: 1.25 (stmia 4 words, every 4 pixels)
//
// With decent register allocation we may be able to load 4 pixels at
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
//
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
// overflow and affect the running disparity, but with 16 zeroes in between,
// this would take much longer than one scanline, so everything is fine if
// we clear the accumulator at the start of the scanline.
//
// Note that we need to use two interpolators to get the bits from both pixels
// -- we are not outputting a single DC-balanced stream, but rather two
// interleaved streams which are each DC-balanced. This is fine electrically,
// but our output here will *NOT* match the TMDS encoder given in the DVI
// spec.
// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
// feedback. With the feedback enabled (default), the output is DC balanced,
// but there are just barely enough CPU cycles to do all the encode, so it's
// essentially a party trick. If you disable DC balancing, the performance is
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.
// Encode one input word through both interpolators.
//
// ra: in:  pixel data word, written to ACCUM0 of both interpolators
//     out: word loaded from the address read from INTERP0 PEEK2
// rb: out: word loaded from the address read from INTERP1 PEEK2
//
// r2 must hold the address of INTERP0 ACCUM0. Unless
// TMDS_FULLRES_NO_DC_BALANCE is set, each loaded word is also written to the
// ACCUM1_ADD register of the interpolator it came from, which is the running
// balance feedback described in the section comment above.
.macro tmds_fullres_encode_loop_body ra rb
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// Full-resolution 16bpp encode loop, written as a macro so that one copy can
// be emitted per scratch bank (see the instantiations below).
//
// In:  r0 = input buffer (word-aligned)
//      r1 = output buffer (word-aligned)
//      r2 = pixel count
//
// One pass of the loop is 16 repetitions of: load two input words, run each
// through tmds_fullres_encode_loop_body, store four output words. The output
// limit is r1 + 4 * r2 bytes, and the loop exits only when r1 equals it.
.macro tmds_fullres_encode_loop_16bpp
    push {r4, r5, r6, r7, lr}
    mov r4, r8
    push {r4}
    lsls r2, r2, #2
    add r2, r1
    mov r12, r2
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    // The running DC balance starts from zero on every scanline
    movs r4, #0
    str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
    // With no balance feedback, INTERP1 is seeded with all-ones instead of
    // zero so that parity alternates between odd and even symbols
    mvns r4, r4
#endif
    str r4, [r2, #ACCUM1_OFFS + INTERP1]
    // The loop entry address is kept in r8 and reached with bx, which allows
    // a longer backward jump than a conditional branch
    adr r4, 1f
    adds r4, #1                     // bx needs bit 0 set to stay in Thumb state
    mov r8, r4
    b 2f
.p2align 2
1:
.rept 16
    ldmia r0!, {r4, r6}
    tmds_fullres_encode_loop_body r4, r5
    tmds_fullres_encode_loop_body r6, r7
    stmia r1!, {r4-r7}
.endr
2:
    cmp r1, r12
    beq 3f
    bx r8
3:
    pop {r4}
    mov r8, r4
    pop {r4, r5, r6, r7, pc}
.endm
// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
    tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
    tmds_fullres_encode_loop_16bpp
// Encode one input word through both interpolators, shifting the copy given
// to INTERP0 left by r3 first.
//
// ra: in:  pixel data word; written unshifted to INTERP1 ACCUM0, then shifted
//          left by r3 and written to INTERP0 ACCUM0
//     out: word loaded from the address read from INTERP0 PEEK2
// rb: out: word loaded from the address read from INTERP1 PEEK2
//
// r2 must hold the address of INTERP0 ACCUM0, r3 the shift amount. Unless
// TMDS_FULLRES_NO_DC_BALANCE is set, each loaded word is also written to the
// ACCUM1_ADD register of the interpolator it came from.
.macro tmds_fullres_encode_loop_body_leftshift ra rb
// Note we apply the leftshift for INTERP0 only
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
lsls \ra, r3
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// Full-resolution 16bpp encode loop with a software left shift on the data
// given to INTERP0. Written as a macro so that one copy can be emitted per
// scratch bank (see the instantiations below).
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
//
// One pass of the loop is 16 repetitions of: load two input words, run each
// through tmds_fullres_encode_loop_body_leftshift, store four output words.
// The output limit is r1 + 4 * r2 bytes, and the loop exits only when r1
// equals it.
//
// Of the high registers, only r8 (loop entry address) and ip (output limit)
// are written here. r8 is saved and restored; ip is left clobbered, as in the
// other encode loops in this file. r9 is never touched, so it is not saved.
.macro tmds_fullres_encode_loop_16bpp_leftshift
    push {r4-r7, lr}
    mov r4, r8
    push {r4}
    lsls r2, #2
    add r2, r1
    mov ip, r2
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    // DC balance defined to be 0 at start of scanline:
    movs r4, #0
    str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
    // Alternate parity between odd/even symbols if there's no balance feedback
    mvns r4, r4
#endif
    str r4, [r2, #ACCUM1_OFFS + INTERP1]
    // Keep loop start pointer in r8 so we can get a longer backward branch
    adr r4, 1f
    adds r4, #1                     // bx needs bit 0 set to stay in Thumb state
    mov r8, r4
    b 2f
.align 2
1:
.rept 16 // 64 pixels per iteration
    ldmia r0!, {r4, r6}
    tmds_fullres_encode_loop_body_leftshift r4 r5
    tmds_fullres_encode_loop_body_leftshift r6 r7
    stmia r1!, {r4, r5, r6, r7}
.endr
2:
    cmp r1, ip
    beq 1f
    bx r8
1:
    pop {r4}
    mov r8, r4
    pop {r4-r7, pc}
.endm
// One copy each in X and Y, as for the other full-resolution loops
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
    tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
    tmds_fullres_encode_loop_16bpp_leftshift
// ----------------------------------------------------------------------------
// Full-resolution 8bpp paletted encode
// Variant of tmds_fullres_encode_loop_16bpp that reads
// 8-bit wide pixels packed 4 per word. The interpolator
// base is set to a reordered list of TMDS symbols based
// on a user colour palette.
// Encode two paletted pixels into one output word.
//
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
//
// The input word is written to ACCUM0 of both interpolators. The word looked
// up via INTERP0 PEEK2 stays in rd; the word looked up via INTERP1 PEEK2 is
// shifted left by 10 and ORed into rd. Unless TMDS_FULLRES_NO_DC_BALANCE is
// set, each looked-up word is also written to the ACCUM1_ADD register of the
// interpolator it came from, before the shift is applied.
//
// Clobbers: r7, flags
.macro tmds_palette_encode_loop_body rd
str \rd, [r2, #ACCUM0_OFFS]
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
// Merge: INTERP1 result goes above the INTERP0 result
lsls r7, #10
orrs \rd, r7
.endm
// Full-resolution 8bpp paletted encode loop, written as a macro so that one
// copy can be emitted per scratch bank (see the instantiations below).
//
// In:  r0 = input buffer, read two words (eight 8-bit pixels) at a time
//      r1 = output buffer, written four words at a time
//      r2 = pixel count; the output limit is r1 + 2 * r2 bytes
//
// One pass of the loop is 10 repetitions of the eight-pixel step. The loop
// exits only when r1 equals the output limit exactly.
//
// Clobbers r3 and ip in addition to the saved and restored r4-r8.
.macro tmds_palette_encode_loop
    push {r4, r5, r6, r7, lr}
    mov r4, r8
    push {r4}
    lsls r2, r2, #1
    add r2, r1
    mov r12, r2
    ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
    // The running DC balance starts from zero on every scanline
    movs r4, #0
    str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
    // With no balance feedback, INTERP1 is seeded with all-ones instead of
    // zero so that parity alternates between odd and even symbols
    mvns r4, r4
#endif
    str r4, [r2, #ACCUM1_OFFS + INTERP1]
    // The loop entry address is kept in r8 and reached with bx, which allows
    // a longer backward jump than a conditional branch
    adr r4, 1f
    adds r4, #1                     // bx needs bit 0 set to stay in Thumb state
    mov r8, r4
    b 2f
.p2align 2
1:
.rept 10
    ldmia r0!, {r3, r5}
    // Split each input word into two halves, each with its two pixels
    // positioned at bits 17:2 as the body macro expects
    lsrs r4, r3, #14
    lsls r3, r3, #2
    lsrs r6, r5, #14
    lsls r5, r5, #2
    tmds_palette_encode_loop_body r3
    tmds_palette_encode_loop_body r4
    tmds_palette_encode_loop_body r5
    tmds_palette_encode_loop_body r6
    stmia r1!, {r3-r6}
.endr
2:
    cmp r1, r12
    beq 3f
    bx r8
3:
    pop {r4}
    mov r8, r4
    pop {r4, r5, r6, r7, pc}
.endm
// One copy in each scratch bank
decl_func_x tmds_palette_encode_loop_x
    tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
    tmds_palette_encode_loop