From fca0bb076a565bf34bd4214b0a27bcb8e20d9e09 Mon Sep 17 00:00:00 2001 From: Laurence Bank Date: Tue, 26 Mar 2024 12:28:41 +0000 Subject: [PATCH] PNGdec: Sync with upstream. * Fixed pixel conversion of 1-bpp grayscale to RGB565 * Improved perf for systems which allow unaligned memory accesses * corrected optimized code to not go past buffer end * speed improvements --- libraries/pngdec/inffast.c | 119 ++++++++++++++++++++++++++++--------- libraries/pngdec/inflate.c | 37 +++++++++++- libraries/pngdec/inflate.h | 3 +- libraries/pngdec/png.inl | 74 ++++++++++++++--------- 4 files changed, 174 insertions(+), 59 deletions(-) diff --git a/libraries/pngdec/inffast.c b/libraries/pngdec/inffast.c index 70cd753c..2f71cab0 100644 --- a/libraries/pngdec/inffast.c +++ b/libraries/pngdec/inffast.c @@ -8,6 +8,20 @@ #include "inflate.h" #include "inffast.h" +#if (INTPTR_MAX == INT64_MAX) || defined(HAL_ESP32_HAL_H_) || defined(TEENSYDUINO) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM7) +#define ALLOWS_UNALIGNED +#endif + +#if INTPTR_MAX == INT64_MAX +#define REGISTER_WIDTH 64 +typedef uint64_t BIGUINT; +typedef uint32_t SMALLUINT; +#else +#define REGISTER_WIDTH 32 +typedef uint32_t BIGUINT; +typedef uint16_t SMALLUINT; +#endif // native register size + #ifdef ASMINF # pragma message("Assembler code may have bugs -- use at your own risk") #else @@ -64,7 +78,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ unsigned whave; /* valid bytes in the window */ unsigned wnext; /* window write index */ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ - unsigned long hold; /* local strm->hold */ + BIGUINT hold, tmpbits; /* local strm->hold */ +// unsigned long hold; /* local strm->hold */ unsigned bits; /* local strm->bits */ code const FAR *lcode; /* local strm->lencode */ code const FAR *dcode; /* local strm->distcode */ @@ -101,11 +116,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ /* decode literals and 
length/distances until end-of-block or not enough input data or output space */ do { - if (bits < 15) { + if (bits < (REGISTER_WIDTH/2)) { // helps on 32 and 64-bit CPUs +#ifdef ALLOWS_UNALIGNED + tmpbits = *(SMALLUINT *)in; + hold |= (BIGUINT)(tmpbits << bits); + in += sizeof(SMALLUINT); + bits += (REGISTER_WIDTH / 2); +#else hold += (unsigned long)(*in++) << bits; bits += 8; hold += (unsigned long)(*in++) << bits; bits += 8; +#endif } here = lcode[hold & lmask]; dolen: @@ -123,20 +145,29 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ len = (unsigned)(here.val); op &= 15; /* number of extra bits */ if (op) { +#if REGISTER_WIDTH == 32 if (bits < op) { - hold += (unsigned long)(*in++) << bits; + hold += (uint32_t)(*in++) << bits; bits += 8; } +#endif len += (unsigned)hold & ((1U << op) - 1); hold >>= op; bits -= op; } Tracevv((stderr, "inflate: length %u\n", len)); - if (bits < 15) { + if (bits < (REGISTER_WIDTH/2)) { // helps on 32 and 64-bit CPUs +#ifdef UNALIGNED_OK + tmpbits = *(SMALLUINT *)in; + hold |= (BIGUINT)(tmpbits << bits); + in += sizeof(SMALLUINT); + bits += (REGISTER_WIDTH / 2); +#else hold += (unsigned long)(*in++) << bits; bits += 8; hold += (unsigned long)(*in++) << bits; bits += 8; +#endif } here = dcode[hold & dmask]; dodist: @@ -147,14 +178,22 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ if (op & 16) { /* distance base */ dist = (unsigned)(here.val); op &= 15; /* number of extra bits */ +#if REGISTER_WIDTH == 32 if (bits < op) { +#ifdef ALLOWS_UNALIGNED + hold |= (*(uint16_t *)in << bits); + bits += 16; + in += 2; +#else hold += (unsigned long)(*in++) << bits; bits += 8; - if (bits < op) { + if (bits < op) { // this is NEVER true hold += (unsigned long)(*in++) << bits; bits += 8; } +#endif // ALLOWS_UNALIGNED } +#endif // 32-bit CPU dist += (unsigned)hold & ((1U << op) - 1); #ifdef INFLATE_STRICT if (dist > dmax) { @@ -236,12 +275,18 @@ unsigned start; /* inflate()'s starting value for 
strm->avail_out */ from = out - dist; /* rest from output */ } } -// if (len > 50 && len < dist) { -// memmove(out, from, len); -// out += len; -// from += len; -// len = 0; -// } else { +#ifdef ALLOWS_UNALIGNED + { + uint8_t *pEnd = out+len; + while (out < pEnd) { + *(uint32_t *)out = *(uint32_t *)from; + out += 4; + from += 4; + } + // correct for possible overshoot of destination ptr + out = pEnd; + } +#else while (len > 2) { *out++ = *from++; *out++ = *from++; @@ -253,22 +298,38 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ if (len > 1) *out++ = *from++; } -// } +#endif // ALLOWS_UNALIGNED } else { from = out - dist; /* copy direct from output */ - // Larry Bank added - - // For relatively large runs, it's faster to let memmove - // use whatever code is efficient on the target platform -// if (dist == 1) { // frequent case for images -// memset(out, *from, len); -// out += len; -// } else if (len > 50 && len < dist) { -// memmove(out, from, len); -// out += len; -// from += len; -// len = 0; -// } else { +#ifdef ALLOWS_UNALIGNED + { + uint8_t *pEnd = out+len; + int overlap = (int)(intptr_t)(out-from); + if (overlap >= 4) { // overlap of source/dest won't impede normal copy + while (out < pEnd) { + *(uint32_t *)out = *(uint32_t *)from; + out += 4; + from += 4; + } + // correct for possible overshoot of destination ptr + out = pEnd; + } else if (overlap == 1) { // copy 1-byte pattern + uint32_t pattern = *from; + pattern = pattern | (pattern << 8); + pattern = pattern | (pattern << 16); + while (out < pEnd) { + *(uint32_t *)out = pattern; + out += 4; + } + out = pEnd; // correct possible overshoot + } else { // overlap of 2 or 3 + while (out < pEnd) { + *out++ = *from++; + } + } + } +#else do { /* minimum length is three */ *out++ = *from++; *out++ = *from++; @@ -280,7 +341,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ if (len > 1) *out++ = *from++; } -// } +#endif // ALLOWS_UNALIGNED } } else if ((op & 64) == 
0) { /* 2nd level distance code */ @@ -310,10 +371,10 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ } while (in < last && out < end); /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ - len = bits >> 3; - in -= len; - bits -= len << 3; - hold &= (1U << bits) - 1; +// len = bits >> 3; +// in -= len; +// bits -= len << 3; +// hold &= (1 << bits) - 1; /* update state and return */ strm->next_in = in; diff --git a/libraries/pngdec/inflate.c b/libraries/pngdec/inflate.c index 3ed1139c..a0b07ec4 100644 --- a/libraries/pngdec/inflate.c +++ b/libraries/pngdec/inflate.c @@ -85,6 +85,10 @@ #include "inflate.h" #include "inffast.h" +#if (INTPTR_MAX == INT64_MAX) || defined(HAL_ESP32_HAL_H_) || defined(TEENSYDUINO) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM7) +#define ALLOWS_UNALIGNED +#endif + #ifdef MAKEFIXED # ifndef BUILDFIXED # define BUILDFIXED @@ -262,7 +266,8 @@ int value; state->bits = 0; return Z_OK; } - if (bits > 16 || state->bits + (uInt)bits > 32) return Z_STREAM_ERROR; + if (bits > 16 || state->bits + (uInt)bits > 32) + return Z_STREAM_ERROR; value &= (1L << bits) - 1; state->hold += (unsigned)value << state->bits; state->bits += (uInt)bits; @@ -1191,9 +1196,39 @@ int check_crc; if (copy > left) copy = left; left -= copy; state->length -= copy; +#ifdef ALLOWS_UNALIGNED + { + uint8_t *pEnd = put+copy; + int overlap = (int)(intptr_t)(put-from); + if (overlap >= 4) { // overlap of source/dest won't impede normal copy + while (put < pEnd-3) { // overwriting the output buffer here would be bad, so respect the true length + *(uint32_t *)put = *(uint32_t *)from; + put += 4; + from += 4; + } + while (put < pEnd) { // tail end + *put++ = *from++; + } + } else if (overlap == 1) { // copy 1-byte pattern + uint32_t pattern = *from; + pattern = pattern | (pattern << 8); + pattern = pattern | (pattern << 16); + while (put < pEnd) { + *(uint32_t *)put = pattern; + put += 4; + } + put = pEnd; // correct possible overshoot 
+ } else { // overlap of 2 or 3 + while (put < pEnd) { + *put++ = *from++; + } + } + } +#else do { *put++ = *from++; } while (--copy); +#endif // ALLOWS_UNALIGNED if (state->length == 0) state->mode = LEN; break; case LIT: diff --git a/libraries/pngdec/inflate.h b/libraries/pngdec/inflate.h index a46cce6b..7a0c29d0 100644 --- a/libraries/pngdec/inflate.h +++ b/libraries/pngdec/inflate.h @@ -98,7 +98,8 @@ struct inflate_state { unsigned wnext; /* window write index */ unsigned char FAR *window; /* allocated sliding window, if needed */ /* bit accumulator */ - unsigned long hold; /* input bit accumulator */ + uint64_t hold; /* input bit accumulator */ +// unsigned long hold; /* input bit accumulator */ unsigned bits; /* number of bits in "in" */ /* for string and stored block copying */ unsigned length; /* literal or length of data to copy */ diff --git a/libraries/pngdec/png.inl b/libraries/pngdec/png.inl index 8b477024..da6a2f37 100644 --- a/libraries/pngdec/png.inl +++ b/libraries/pngdec/png.inl @@ -243,15 +243,33 @@ PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin } break; case PNG_PIXEL_GRAYSCALE: - for (x=0; x<pDraw->iWidth; x++) { - c = *s++; - usPixel = (c >> 3); // blue - usPixel |= ((c >> 2) << 5); // green - usPixel |= ((c >> 3) << 11); // red - if (iEndiannes == PNG_RGB565_BIG_ENDIAN) - usPixel = __builtin_bswap16(usPixel); - *pDest++ = usPixel; - } + switch (pDraw->iBpp) { + case 8: + for (x=0; x<pDraw->iWidth; x++) { + c = *s++; + usPixel = (c >> 3); // blue + usPixel |= ((c >> 2) << 5); // green + usPixel |= ((c >> 3) << 11); // red + if (iEndiannes == PNG_RGB565_BIG_ENDIAN) + usPixel = __builtin_bswap16(usPixel); + *pDest++ = usPixel; + } + break; + case 1: + for (x=0; x<pDraw->iWidth; x++) { + if ((x & 7) == 0) { + c = *s++; + } + if (c & 0x80) { + usPixel = 0xffff; + } else { + usPixel = 0; + } + *pDest++ = usPixel; + c <<= 1; + } + break; + } // switch on bpp break; case PNG_PIXEL_TRUECOLOR: for (x=0; x<pDraw->iWidth; x++) { @@ -302,15 +320,15 @@
PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin } break; case 1: - for (x=0; x<pDraw->iWidth; x+=4) { - c = *s++; - for (j=0; j<8; j++) { // work on pairs of bits - usPixel = pDraw->pFastPalette[c >> 7]; - if (iEndiannes == PNG_RGB565_BIG_ENDIAN) - usPixel = __builtin_bswap16(usPixel); - *pDest++ = usPixel; - c <<= 1; + for (x=0; x<pDraw->iWidth; x++) { + if ((x & 7) == 0) { + c = *s++; } + usPixel = pDraw->pFastPalette[c >> 7]; + if (iEndiannes == PNG_RGB565_BIG_ENDIAN) + usPixel = __builtin_bswap16(usPixel); + *pDest++ = usPixel; + c <<= 1; } break; } // switch on bpp @@ -379,18 +397,18 @@ PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin } break; case 1: - for (x=0; x<pDraw->iWidth; x+=4) { - c = *s++; - for (j=0; j<8; j++) { // work on pairs of bits - pPal = &pDraw->pPalette[(c >> 7) * 3]; - usPixel = (pPal[2] >> 3); // blue - usPixel |= ((pPal[1] >> 2) << 5); // green - usPixel |= ((pPal[0] >> 3) << 11); // red - if (iEndiannes == PNG_RGB565_BIG_ENDIAN) - usPixel = __builtin_bswap16(usPixel); - *pDest++ = usPixel; - c <<= 1; + for (x=0; x<pDraw->iWidth; x++) { + if ((x & 7) == 0) { + c = *s++; } + pPal = &pDraw->pPalette[(c >> 7) * 3]; + usPixel = (pPal[2] >> 3); // blue + usPixel |= ((pPal[1] >> 2) << 5); // green + usPixel |= ((pPal[0] >> 3) << 11); // red + if (iEndiannes == PNG_RGB565_BIG_ENDIAN) + usPixel = __builtin_bswap16(usPixel); + *pDest++ = usPixel; + c <<= 1; } break; } // switch on bits per pixel