osci-render/Source/img/qoixx.hpp

#ifndef QOIXX_HPP_INCLUDED_
#define QOIXX_HPP_INCLUDED_

#include<cstdint>
#include<cstddef>
#include<cstring>
#include<vector>
#include<type_traits>
#include<memory>
#include<stdexcept>
#include<bit>
#include<numeric>
#include<array>
#include<utility>

#ifndef QOIXX_NO_SIMD
#if defined(__ARM_FEATURE_SVE)
#include<arm_sve.h>
#include<arm_neon.h>
#elif defined(__aarch64__)
#include<arm_neon.h>
#elif defined(__AVX2__)
#include<immintrin.h>
#endif
#endif

namespace qoixx{

namespace detail{

template<typename T>
requires(sizeof(T) == 1 && !std::same_as<T, bool>)
struct contiguous_puller{
  static constexpr bool is_contiguous = true;
  const T* t;
  inline std::uint8_t pull()noexcept{
    return static_cast<std::uint8_t>(*t++);
  }
  inline const std::uint8_t* raw_pointer()noexcept{
    return reinterpret_cast<const std::uint8_t*>(t);
  }
  inline void advance(std::size_t n)noexcept{
    t += n;
  }
};

template<typename T>
struct default_container_operator;

template<typename T, typename A>
requires(sizeof(T) == 1)
struct default_container_operator<std::vector<T, A>>{
  using target_type = std::vector<T, A>;
  static inline target_type construct(std::size_t size){
    target_type t(size);
    return t;
  }
  struct pusher{
    static constexpr bool is_contiguous = true;
    target_type* t;
    std::size_t i = 0;
    inline void push(std::uint8_t x)noexcept{
      (*t)[i++] = static_cast<T>(x);
    }
    template<typename U>
    requires std::unsigned_integral<U> && (sizeof(U) != 1)
    inline void push(U t)noexcept{
      this->push(static_cast<std::uint8_t>(t));
    }
    inline target_type finalize()noexcept{
      t->resize(i);
      return std::move(*t);
    }
    inline std::uint8_t* raw_pointer()noexcept{
      return reinterpret_cast<std::uint8_t*>(t->data())+i;
    }
    inline void advance(std::size_t n)noexcept{
      i += n;
    }
  };
  static constexpr pusher create_pusher(target_type& t)noexcept{
    return {&t};
  }
  using puller = contiguous_puller<T>;
  static constexpr puller create_puller(const target_type& t)noexcept{
    return {t.data()};
  }
  static inline std::size_t size(const target_type& t)noexcept{
    return t.size();
  }
  static constexpr bool valid(const target_type& t)noexcept{
    return t.capacity() != 0;
  }
};

template<typename T>
requires(sizeof(T) == 1)
struct default_container_operator<std::pair<std::unique_ptr<T[]>, std::size_t>>{
  using target_type = std::pair<std::unique_ptr<T[]>, std::size_t>;
  static inline target_type construct(std::size_t size){
    return {typename target_type::first_type{static_cast<T*>(::operator new[](size))}, 0};
  }
  struct pusher{
    static constexpr bool is_contiguous = true;
    target_type* t;
    inline void push(std::uint8_t x)noexcept{
      t->first[t->second++] = static_cast<T>(x);
    }
    template<typename U>
    requires std::unsigned_integral<U> && (sizeof(U) != 1)
    inline void push(U t)noexcept{
      this->push(static_cast<std::uint8_t>(t));
    }
    inline target_type finalize()noexcept{
      return std::move(*t);
    }
    inline std::uint8_t* raw_pointer()noexcept{
      return reinterpret_cast<std::uint8_t*>(t->first.get())+t->second;
    }
    inline void advance(std::size_t n)noexcept{
      t->second += n;
    }
  };
  static constexpr pusher create_pusher(target_type& t)noexcept{
    return {&t};
  }
  using puller = contiguous_puller<T>;
  static constexpr puller create_puller(const target_type& t)noexcept{
    return {t.first.get()};
  }
  static inline std::size_t size(const target_type& t)noexcept{
    return t.second;
  }
  static constexpr bool valid(const target_type& t)noexcept{
    return t.first != nullptr;
  }
};

template<typename T>
requires(sizeof(T) == 1)
struct default_container_operator<std::pair<T*, std::size_t>>{
  using target_type = std::pair<T*, std::size_t>;
  using puller = contiguous_puller<T>;
  static constexpr puller create_puller(const target_type& t)noexcept{
    return {t.first};
  }
  static inline std::size_t size(const target_type& t)noexcept{
    return t.second;
  }
  static inline bool valid(const target_type& t)noexcept{
    return t.first != nullptr;
  }
};

}

template<typename T>
struct container_operator : detail::default_container_operator<T>{};

class qoi{
  template<std::size_t Size>
  static inline void efficient_memcpy(void* dst, const void* src){
    if constexpr(Size == 3){
      std::memcpy(dst, src, 2);
      std::memcpy(static_cast<std::byte*>(dst)+2, static_cast<const std::byte*>(src)+2, 1);
    }
    else
      std::memcpy(dst, src, Size);
  }
  template<std::size_t Size, typename T>
  static inline void push(T& dst, const void* src){
    if constexpr(T::is_contiguous){
      auto*const ptr = dst.raw_pointer();
      dst.advance(Size);
      efficient_memcpy<Size>(ptr, src);
    }
    else{
      const auto* ptr = static_cast<const std::uint8_t*>(src);
      auto size = Size;
      while(size --> 0)
        dst.push(*ptr++);
    }
  }
  template<std::size_t Size, typename T>
  static inline void pull(void* dst, T& src){
    if constexpr(T::is_contiguous){
      const auto*const ptr = src.raw_pointer();
      src.advance(Size);
      efficient_memcpy<Size>(dst, ptr);
    }
    else{
      auto* ptr = static_cast<std::uint8_t*>(dst);
      auto size = Size;
      while(size --> 0)
        *ptr++ = src.pull();
    }
  }
  enum chunk_tag : std::uint32_t{
    index = 0b0000'0000u,
    diff  = 0b0100'0000u,
    luma  = 0b1000'0000u,
    run   = 0b1100'0000u,
    rgb   = 0b1111'1110u,
    rgba  = 0b1111'1111u,
  };
  static constexpr std::size_t index_size = 64u;
 public:
  enum class colorspace : std::uint8_t{
    srgb = 0,
    linear = 1,
  };
  struct desc{
    std::uint32_t width;
    std::uint32_t height;
    std::uint8_t channels;
    qoi::colorspace colorspace;
    constexpr bool operator==(const desc&)const noexcept = default;
  };
  struct rgba_t{
    std::uint8_t r, g, b, a;
    inline std::uint32_t v()const{
      static_assert(sizeof(rgba_t) == sizeof(std::uint32_t));
      if constexpr(std::endian::native == std::endian::little){
        std::uint32_t x;
        std::memcpy(&x, this, sizeof(std::uint32_t));
        return x;
      }
      else
        return std::uint32_t{r}       |
               std::uint32_t{g} <<  8 |
               std::uint32_t{b} << 16 |
               std::uint32_t{a} << 24;
    }
    inline std::uint_fast32_t hash()const{
      static constexpr std::uint64_t constant =
        static_cast<std::uint64_t>(3u) << 56 |
                                   5u  << 16 |
        static_cast<std::uint64_t>(7u) << 40 |
                                  11u;
      const auto v = static_cast<std::uint64_t>(this->v());
      return (((v<<32|v)&0xFF00FF0000FF00FF)*constant)>>56;
    }
    inline bool operator==(const rgba_t& rhs)const{
      return v() == rhs.v();
    }
    inline bool operator!=(const rgba_t& rhs)const{
      return v() != rhs.v();
    }
  };
  struct rgb_t{
    std::uint8_t r, g, b;
    inline std::uint32_t v()const{
      static_assert(sizeof(rgb_t) == 3u);
      if constexpr(std::endian::native == std::endian::little){
        std::uint32_t x = 255u << 24u;
        efficient_memcpy<3>(&x, this);
        return x;
      }
      else
        return std::uint32_t{r}       |
               std::uint32_t{g} <<  8 |
               std::uint32_t{b} << 16 |
                           255u << 24;
    }
    inline std::uint_fast32_t hash()const{
      static constexpr std::uint64_t constant =
        static_cast<std::uint64_t>(3u) << 56 |
                                   5u  << 16 |
        static_cast<std::uint64_t>(7u) << 40 |
                                  11u;
      const auto v =
        static_cast<std::uint64_t>(r)          |
        static_cast<std::uint64_t>(g)    << 40 |
        static_cast<std::uint64_t>(b)    << 16 |
        static_cast<std::uint64_t>(0xff) << 56 ;
      return (v*constant)>>56;
    }
    inline bool operator==(const rgb_t& rhs)const{
      return ((this->r^rhs.r)|(this->g^rhs.g)|(this->b^rhs.b)) == 0;
    }
  };
  static constexpr std::uint32_t magic =
    113u /*q*/ << 24 | 111u /*o*/ << 16 | 105u /*i*/ <<  8 | 102u /*f*/ ;
  static constexpr std::size_t header_size =
    sizeof(magic) +
    sizeof(std::declval<desc>().width) +
    sizeof(std::declval<desc>().height) +
    sizeof(std::declval<desc>().channels) +
    sizeof(std::declval<desc>().colorspace);
  static constexpr std::size_t pixels_max = 400000000u;
  static constexpr std::uint8_t padding[8] = {0, 0, 0, 0, 0, 0, 0, 1};
  template<typename Puller>
  static inline std::uint32_t read_32(Puller& p){
    if constexpr(std::endian::native == std::endian::big && Puller::is_contiguous){
      std::uint32_t x;
      pull<sizeof(x)>(&x, p);
      return x;
    }
    else{
      const auto _1 = p.pull();
      const auto _2 = p.pull();
      const auto _3 = p.pull();
      const auto _4 = p.pull();
      return static_cast<std::uint32_t>(_1 << 24 | _2 << 16 | _3 << 8 | _4);
    }
  }
  template<typename Pusher>
  static inline void write_32(Pusher& p, std::uint32_t value){
    if constexpr(std::endian::native == std::endian::big && Pusher::is_contiguous)
      push<sizeof(value)>(p, value);
    else{
      p.push((value & 0xff000000) >> 24);
      p.push((value & 0x00ff0000) >> 16);
      p.push((value & 0x0000ff00) >>  8);
      p.push( value & 0x000000ff       );
    }
  }
 private:
  template<bool Alpha>
  using local_rgba_pixel_t = std::conditional_t<Alpha, rgba_t, rgb_t>;
  template<bool Alpha>
  static constexpr local_rgba_pixel_t<Alpha> default_pixel()noexcept{
    if constexpr(Alpha)
      return {0, 0, 0, 255};
    else
      return {};
  }
  template<bool Alpha>
  struct local_pixel{
    std::uint8_t rgb = static_cast<std::uint8_t>(chunk_tag::rgb);
    local_rgba_pixel_t<Alpha> v;
  };
  static_assert(std::has_unique_object_representations_v<local_pixel<true>> and std::has_unique_object_representations_v<local_pixel<false>>);
  template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
  static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, local_rgba_pixel_t<Channels == 4u> px_prev = default_pixel<Channels == 4u>(), std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
    local_pixel<Channels == 4u> px;
    while(px_len--)[[likely]]{
      pull<Channels>(&px.v, pixels);
      if(px.v.v() == px_prev.v()){
        ++run;
        continue;
      }
      if(run > 0){
        while(run >= 62)[[unlikely]]{
          static constexpr std::uint8_t x = chunk_tag::run | 61;
          p.push(x);
          run -= 62;
        }
        if(run > 1){
          p.push(chunk_tag::run | (run-1));
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            p.push(chunk_tag::run);
          else
            p.push(chunk_tag::index | prev_hash);
          run = 0;
        }
      }

      const auto index_pos = px.v.hash() % index_size;
      prev_hash = index_pos;

      do{
        if(index[index_pos].v() == px.v.v()){
          p.push(chunk_tag::index | index_pos);
          break;
        }
        efficient_memcpy<Channels>(index + index_pos, &px.v);
        if constexpr(Channels == 3)
          index[index_pos].a = 255u;

        if constexpr(Channels == 4)
          if(px.v.a != px_prev.a){
            p.push(chunk_tag::rgba);
            push<4>(p, &px.v);
            break;
          }
        const auto vg_2 = static_cast<int>(px.v.g) - static_cast<int>(px_prev.g);
        if(const std::uint8_t g = vg_2+32; g < 64){
          const auto vr = static_cast<int>(px.v.r) - static_cast<int>(px_prev.r) + 2;
          const auto vg = vg_2 + 2;
          const auto vb = static_cast<int>(px.v.b) - static_cast<int>(px_prev.b) + 2;

          if(static_cast<std::uint8_t>(vr|vg|vb) < 4){
            p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
            break;
          }
          const auto vg_r = vr - vg + 8;
          const auto vg_b = vb - vg + 8;
          if(static_cast<std::uint8_t>(vg_r|vg_b) < 16){
            p.push(chunk_tag::luma | g);
            p.push(vg_r << 4 | vg_b);
          }
          else
            push<4>(p, &px);
        }
        else
          push<4>(p, &px);
      }while(false);
      efficient_memcpy<Channels>(&px_prev, &px.v);
    }
    while(run >= 62)[[unlikely]]{
      static constexpr std::uint8_t x = chunk_tag::run | 61;
      p.push(x);
      run -= 62;
    }
    if(run > 0)
      p.push(chunk_tag::run | (run-1));
  }
#ifndef QOIXX_NO_SIMD
#if defined(__ARM_FEATURE_SVE)
  template<bool Alpha>
  using pixels_type = std::conditional_t<Alpha, svuint8x4_t, svuint8x3_t>;
  template<typename... Args>
  requires (std::same_as<std::decay_t<Args>, svuint8_t> && ...)
  static inline pixels_type<sizeof...(Args) == 4> create(Args&&... args)noexcept{
    if constexpr(sizeof...(Args) == 4)
      return svcreate4_u8(std::forward<Args>(args)...);
    else
      return svcreate3_u8(std::forward<Args>(args)...);
  }
  template<std::size_t ImmIndex>
  static inline svuint8_t get(svuint8x4_t t)noexcept{
    return svget4_u8(t, ImmIndex);
  }
  template<std::size_t ImmIndex>
  static inline svuint8_t get(svuint8x3_t t)noexcept{
    return svget3_u8(t, ImmIndex);
  }
  template<bool Alpha>
  static inline pixels_type<Alpha> load(svbool_t pg, const std::uint8_t* ptr)noexcept{
    if constexpr(Alpha)
      return svld4_u8(pg, ptr);
    else
      return svld3_u8(pg, ptr);
  }
  template<std::size_t SVERegisterSize, std::uint_fast8_t Channels, typename Pusher, typename Puller>
  static inline void encode_sve(Pusher& p_, Puller& pixels_, const desc& desc){
    static constexpr bool Alpha = Channels == 4;
    std::uint8_t* p = p_.raw_pointer();
    const std::uint8_t* pixels = pixels_.raw_pointer();

    rgba_t index[index_size] = {};

    const auto zero = svdup_n_u8(0);
    const auto iota = svindex_u8(0, 1);

    pixels_type<Alpha> prev;
    if constexpr(Alpha)
      prev = create(zero, zero, zero, svdup_n_u8(255));
    else
      prev = create(zero, zero, zero);

    std::size_t run = 0;
    rgba_t px = {0, 0, 0, 255};
    auto prev_hash = static_cast<std::uint8_t>(index_size);

    const std::size_t px_len = desc.width * desc.height;
    static constexpr auto vector_lanes = SVERegisterSize/8;
    for(std::size_t i = 0; i < px_len; i += vector_lanes){
      const auto mask = svwhilelt_b8_u64(i, px_len);
      const auto num = std::min(px_len-i, vector_lanes);
      const auto pxs = load<Alpha>(mask, pixels);
      static constexpr std::uint64_t imm = SVERegisterSize/8-1;
      auto rv = svsub_u8_x(mask, get<0>(pxs), svext_u8(get<0>(prev), get<0>(pxs), imm));
      auto gv = svsub_u8_x(mask, get<1>(pxs), svext_u8(get<1>(prev), get<1>(pxs), imm));
      auto bv = svsub_u8_x(mask, get<2>(pxs), svext_u8(get<2>(prev), get<2>(pxs), imm));
      [[maybe_unused]] svbool_t av;
      bool alpha = true;
      if constexpr(Alpha){
        av = svcmpeq_n_u8(mask, svsub_u8_x(mask, get<3>(pxs), svext_u8(get<3>(prev), get<3>(pxs), imm)), 0);
        alpha = !svptest_any(mask, svnot_b_z(mask, av));
      }
      auto runv = svcmpeq_n_u8(mask, svorr_u8_x(mask, svorr_u8_x(mask, rv, gv), bv), 0);
      if constexpr(Alpha)
        runv = svand_b_z(mask, runv, av);
      const auto not_runv = svnot_b_z(mask, runv);
      if(!svptest_any(mask, not_runv)){
        run += num;
        pixels += num*Channels;
        continue;
      }
      const auto r = svminv_u8(not_runv, iota);
      run += r;
      pixels += r*Channels;
      if(run > 0){
        while(run >= 62)[[unlikely]]{
          static constexpr std::uint8_t x = chunk_tag::run | 61;
          *p++ = x;
          run -= 62;
        }
        if(run > 1){
          *p++ = chunk_tag::run | (run-1);
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
      }
      rv = svadd_n_u8_x(mask, rv, 2);
      gv = svadd_n_u8_x(mask, gv, 2);
      bv = svadd_n_u8_x(mask, bv, 2);
      const auto diffv = svorr_u8_z(svcmplt_n_u8(mask, svorr_u8_z(mask, svorr_u8_x(mask, rv, gv), bv), 4), svorr_n_u8_x(mask, svlsl_n_u8_x(mask, rv, 4), chunk_tag::diff), svorr_u8_x(mask, svlsl_n_u8_x(mask, gv, 2), bv));
      rv = svadd_n_u8_x(mask, svsub_u8_x(mask, rv, gv), 8);
      bv = svadd_n_u8_x(mask, svsub_u8_x(mask, bv, gv), 8);
      gv = svadd_n_u8_x(mask, gv, 30);
      const auto lu = svorr_n_u8_z(svcmpeq_n_u8(mask, svorr_u8_x(mask, svand_n_u8_x(mask, svorr_u8_x(mask, rv, bv), 0xf0), svand_n_u8_x(mask, gv, 0xc0)), 0), gv, chunk_tag::luma);
      const auto ma = svorr_u8_x(mask, svlsl_n_u8_x(mask, rv, 4), bv);
      svuint8_t hash;
      if constexpr(Alpha)
        hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), svmul_n_u8_x(mask, get<3>(pxs), 11))), 63);
      else
        hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_n_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), static_cast<std::uint8_t>(255*11))), 63);
      std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lumas[SVERegisterSize/8*2], hashs[SVERegisterSize/8];
      [[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8];
      svst1_u8(mask, runs, svadd_n_u8_m(runv, zero, 1));
      svst1_u8(mask, diffs, diffv);
      const auto luma = svcreate2_u8(lu, ma);
      svst2_u8(mask, lumas, luma);
      svst1_u8(mask, hashs, hash);
      if constexpr(Alpha)
        if(!alpha)
          svst1_u8(mask, alphas, svadd_n_u8_m(av, zero, 1));
      for(std::size_t i = r; i < num; ++i){
        if(runs[i]){
          ++run;
          pixels += Channels;
          continue;
        }
        if(run > 1){
          *p++ = chunk_tag::run | (run-1);
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
        const auto index_pos = hashs[i];
        prev_hash = index_pos;
        efficient_memcpy<Channels>(&px, pixels);
        pixels += Channels;
        if(index[index_pos] == px){
          *p++ = chunk_tag::index | index_pos;
          continue;
        }
        index[index_pos] = px;

        if constexpr(Alpha)
          if(!alpha && !alphas[i]){
            *p++ = chunk_tag::rgba;
            std::memcpy(p, &px, 4);
            p += 4;
            continue;
          }
        if(diffs[i])
          *p++ = diffs[i];
        else if(lumas[i*2]){
          std::memcpy(p, lumas + i*2, 2);
          p += 2;
        }
        else{
          *p++ = chunk_tag::rgb;
          efficient_memcpy<3>(p, &px);
          p += 3;
        }
      }
      prev = pxs;
    }
    while(run >= 62)[[unlikely]]{
      static constexpr std::uint8_t x = chunk_tag::run | 61;
      *p++ = x;
      run -= 62;
    }
    if(run > 0){
      *p++ = chunk_tag::run | (run-1);
      run = 0;
    }
    p_.advance(p-p_.raw_pointer());
    pixels_.advance(px_len*Channels);

    push<sizeof(padding)>(p_, padding);
  }
#elif defined(__aarch64__)
  template<bool Alpha>
  using pixels_type = std::conditional_t<Alpha, uint8x16x4_t, uint8x16x3_t>;
  template<bool Alpha>
  static inline pixels_type<Alpha> load(const std::uint8_t* ptr)noexcept{
    if constexpr(Alpha)
      return vld4q_u8(ptr);
    else
      return vld3q_u8(ptr);
  }
  static constexpr std::size_t simd_lanes = 16;
  template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
  static inline void encode_neon(Pusher& p_, Puller& pixels_, const desc& desc){
    static constexpr bool Alpha = Channels == 4;
    std::uint8_t* p = p_.raw_pointer();
    const std::uint8_t* pixels = pixels_.raw_pointer();

    rgba_t index[index_size] = {};

    const auto zero = vdupq_n_u8(0);
    static constexpr std::uint8_t iota_[simd_lanes] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    const auto iota = vld1q_u8(iota_);

    pixels_type<Alpha> prev;
    prev.val[0] = prev.val[1] = prev.val[2] = zero;
    if constexpr(Alpha)
      prev.val[3] = vdupq_n_u8(255);

    std::size_t run = 0;
    rgba_t px = {0, 0, 0, 255};
    auto prev_hash = static_cast<std::uint8_t>(index_size);

    std::size_t px_len = desc.width * desc.height;
    std::size_t simd_len = px_len / simd_lanes;
    const std::size_t simd_len_16 = simd_len * simd_lanes;
    px_len -= simd_len_16;
    pixels_.advance(simd_len_16*Channels);
    while(simd_len--){
      const auto pxs = load<Alpha>(pixels);
      pixels_type<Alpha> diff;
      diff.val[0] = vsubq_u8(pxs.val[0], vextq_u8(prev.val[0], pxs.val[0], simd_lanes-1));
      diff.val[1] = vsubq_u8(pxs.val[1], vextq_u8(prev.val[1], pxs.val[1], simd_lanes-1));
      diff.val[2] = vsubq_u8(pxs.val[2], vextq_u8(prev.val[2], pxs.val[2], simd_lanes-1));
      bool alpha = true;
      if constexpr(Alpha){
        diff.val[3] = vsubq_u8(pxs.val[3], vextq_u8(prev.val[3], pxs.val[3], simd_lanes-1));
        diff.val[3] = vceqq_u8(diff.val[3], zero);
        alpha = vminvq_u8(diff.val[3]) != 0;
      }
      auto runv = vceqq_u8(vorrq_u8(vorrq_u8(diff.val[0], diff.val[1]), diff.val[2]), zero);
      if(vminvq_u8(runv) != 0 && alpha){
        run += simd_lanes;
        pixels += simd_lanes*Channels;
        continue;
      }
      if constexpr(Alpha)
        runv = vandq_u8(runv, diff.val[3]);
      const auto r = vminvq_u8(vorrq_u8(vandq_u8(vmvnq_u8(runv), iota), runv));
      run += r;
      pixels += r*Channels;
      if(run > 0){
        while(run >= 62)[[unlikely]]{
          static constexpr std::uint8_t x = chunk_tag::run | 61;
          *p++ = x;
          run -= 62;
        }
        if(run > 1){
          *p++ = chunk_tag::run | (run-1);
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
      }
      const auto two = vdupq_n_u8(2);
      diff.val[0] = vaddq_u8(diff.val[0], two);
      diff.val[1] = vaddq_u8(diff.val[1], two);
      diff.val[2] = vaddq_u8(diff.val[2], two);
      const auto four = vdupq_n_u8(4);
      const auto diffv = vandq_u8(vorrq_u8(vorrq_u8(vdupq_n_u8(chunk_tag::diff), vshlq_n_u8(diff.val[0], 4)), vorrq_u8(vshlq_n_u8(diff.val[1], 2), diff.val[2])), vcltq_u8(vorrq_u8(vorrq_u8(diff.val[0], diff.val[1]), diff.val[2]), four));
      const auto eight = vdupq_n_u8(8);
      diff.val[0] = vaddq_u8(vsubq_u8(diff.val[0], diff.val[1]), eight);
      diff.val[2] = vaddq_u8(vsubq_u8(diff.val[2], diff.val[1]), eight);
      diff.val[1] = vaddq_u8(diff.val[1], vdupq_n_u8(30));
      const auto lu = vandq_u8(vorrq_u8(vdupq_n_u8(chunk_tag::luma), diff.val[1]), vceqq_u8(vorrq_u8(vandq_u8(vorrq_u8(diff.val[0], diff.val[2]), vdupq_n_u8(0xf0)), vandq_u8(diff.val[1], vdupq_n_u8(0xc0))), zero));
      const auto ma = vorrq_u8(vshlq_n_u8(diff.val[0], 4), diff.val[2]);
      uint8x16_t hash;
      if constexpr(Alpha)
        hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vmulq_u8(pxs.val[3], vdupq_n_u8(11)))), vdupq_n_u8(63));
      else
        hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vdupq_n_u8(static_cast<std::uint8_t>(255*11)))), vdupq_n_u8(63));
      std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
      [[maybe_unused]] std::uint8_t alphas[simd_lanes];
      vst1q_u8(runs, runv);
      vst1q_u8(diffs, diffv);
      vst2q_u8(lumas, (uint8x16x2_t{lu, ma}));
      vst1q_u8(hashs, hash);
      if constexpr(Alpha)
        if(!alpha)
          vst1q_u8(alphas, diff.val[3]);
      for(std::size_t i = r; i < simd_lanes; ++i){
        if(runs[i]){
          ++run;
          pixels += Channels;
          continue;
        }
        if(run > 1){
          *p++ = chunk_tag::run | (run-1);
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
        const auto index_pos = hashs[i];
        prev_hash = index_pos;
        efficient_memcpy<Channels>(&px, pixels);
        pixels += Channels;
        if(index[index_pos] == px){
          *p++ = chunk_tag::index | index_pos;
          continue;
        }
        index[index_pos] = px;

        if constexpr(Alpha)
          if(!alpha && !alphas[i]){
            *p++ = chunk_tag::rgba;
            std::memcpy(p, &px, 4);
            p += 4;
            continue;
          }
        if(diffs[i])
          *p++ = diffs[i];
        else if(lumas[i*2]){
          std::memcpy(p, lumas + i*2, 2);
          p += 2;
        }
        else{
          *p++ = chunk_tag::rgb;
          efficient_memcpy<3>(p, &px);
          p += 3;
        }
      }
      prev = pxs;
    }
    p_.advance(p-p_.raw_pointer());

    if constexpr(Alpha)
      encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
    else{
      rgb_t px_prev;
      efficient_memcpy<3>(&px_prev, &px);
      encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
    }

    push<sizeof(padding)>(p_, padding);
  }
#elif defined(__AVX2__)
  static constexpr unsigned de_bruijn_bit_position_sequence[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
  };
  static constexpr unsigned lsb32(std::uint32_t x)noexcept{
    return de_bruijn_bit_position_sequence[(static_cast<std::uint32_t>(x&-static_cast<std::int32_t>(x))*0x077cb531u) >> 27];
  }
  template<std::uint8_t M>
  static inline __m256i slli_epi8(__m256i v)noexcept{
    const auto mask = _mm256_set1_epi8(static_cast<std::uint8_t>(0xff << M) >> M);
    return _mm256_slli_epi16(_mm256_and_si256(v, mask), M);
  }
  template<std::uint8_t M>
  static inline __m256i mul_epi8(__m256i v)noexcept{
    if constexpr(M == 0)
      return _mm256_setzero_si256();
    else if constexpr(M == 1)
      return v;
    else if constexpr(M == 2)
      return slli_epi8<1>(v);
    else if constexpr(M == 3)
      return _mm256_add_epi8(slli_epi8<1>(v), v);
    else if constexpr(M == 4)
      return slli_epi8<2>(v);
    else if constexpr(M == 5)
      return _mm256_add_epi8(slli_epi8<2>(v), v);
    else if constexpr(M == 6)
      return _mm256_add_epi8(slli_epi8<2>(v), slli_epi8<1>(v));
    else if constexpr(M == 7)
      return _mm256_sub_epi8(slli_epi8<3>(v), v);
    else if constexpr(M == 8)
      return slli_epi8<3>(v);
    else if constexpr(M == 9)
      return _mm256_add_epi8(slli_epi8<3>(v), v);
    else if constexpr(M == 10)
      return _mm256_add_epi8(slli_epi8<3>(v), slli_epi8<1>(v));
    else if constexpr(M == 11)
      return _mm256_add_epi8(_mm256_add_epi8(slli_epi8<3>(v), slli_epi8<1>(v)), v);
    else if constexpr(M == 12)
      return _mm256_add_epi8(slli_epi8<3>(v), slli_epi8<2>(v));
    else if constexpr(M == 13)
      return _mm256_add_epi8(_mm256_add_epi8(slli_epi8<3>(v), slli_epi8<2>(v)), v);
    else if constexpr(M == 14)
      return _mm256_sub_epi8(slli_epi8<4>(v), slli_epi8<1>(v));
    else if constexpr(M == 15)
      return _mm256_sub_epi8(slli_epi8<4>(v), v);
    else
      static_assert(M <= 15);
  }
  static inline __m256i prev_vector(__m256i pxs, __m256i prev)noexcept{
    const auto permute = _mm256_permute2x128_si256(pxs, pxs, 0x08);
    const auto inserted = _mm256_inserti128_si256(permute, _mm256_extracti128_si256(prev, 1), 0);
    return _mm256_alignr_epi8(pxs, inserted, 15);
  }
  template<bool Alpha>
  struct pixels_type{
    __m256i val[3+Alpha];
  };
  static constexpr std::size_t simd_lanes = 256/8;
  template<bool Alpha>
  static inline pixels_type<Alpha> load(const std::uint8_t* ptr)noexcept{
    if constexpr(Alpha){
      const auto t1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
      const auto t2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes));
      const auto t3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes*2));
      const auto t4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes*3));
      const auto lo12 = _mm256_unpacklo_epi8(t1, t2);
      const auto lo34 = _mm256_unpacklo_epi8(t3, t4);
      const auto lolo12lo34 = _mm256_unpacklo_epi16(lo12, lo34);
      const auto hilo12lo34 = _mm256_unpackhi_epi16(lo12, lo34);
      const auto lololo12lo34hilo12lo34 = _mm256_unpacklo_epi32(lolo12lo34, hilo12lo34);
      const auto hilolo12lo34hilo12lo34 = _mm256_unpackhi_epi32(lolo12lo34, hilo12lo34);
      const auto hi12 = _mm256_unpackhi_epi8(t1, t2);
      const auto hi34 = _mm256_unpackhi_epi8(t3, t4);
      const auto lohi12hi34 = _mm256_unpacklo_epi16(hi12, hi34);
      const auto hihi12hi34 = _mm256_unpackhi_epi16(hi12, hi34);
      const auto lolohi12hi34hihi12hi34 = _mm256_unpacklo_epi32(lohi12hi34, hihi12hi34);
      const auto lolololo12lo34hilo12lo34lolohi12hi34hihi12hi34 = _mm256_unpacklo_epi64(lololo12lo34hilo12lo34, lolohi12hi34hihi12hi34);
      const auto hilololo12lo34hilo12lo34lolohi12hi34hihi12hi34 = _mm256_unpackhi_epi64(lololo12lo34hilo12lo34, lolohi12hi34hihi12hi34);
      const auto hilohi12hi34hihi12hi34 = _mm256_unpackhi_epi32(lohi12hi34, hihi12hi34);
      const auto lohilolo12lo34hilo12lo34hilohi12hi34hihi12hi34 = _mm256_unpacklo_epi64(hilolo12lo34hilo12lo34, hilohi12hi34hihi12hi34);
      const auto hihilolo12lo34hilo12lo34hilohi12hi34hihi12hi34 = _mm256_unpackhi_epi64(hilolo12lo34hilo12lo34, hilohi12hi34hihi12hi34);
      const auto mask1 = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
      const auto mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
      const auto r = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(lolololo12lo34hilo12lo34lolohi12hi34hihi12hi34, mask1), mask2);
      const auto g = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(hilololo12lo34hilo12lo34lolohi12hi34hihi12hi34, mask1), mask2);
      const auto b = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(lohilolo12lo34hilo12lo34hilohi12hi34hihi12hi34, mask1), mask2);
      const auto a = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(hihilolo12lo34hilo12lo34hilohi12hi34hihi12hi34, mask1), mask2);
      return {{r, g, b, a}};
    }
    else{
      const auto t1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
      const auto t2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes/2));
      const auto t3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes));
      const auto t4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*3/2));
      const auto t5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*2));
      const auto t6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*5/2));
      const auto mask01 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
      const auto mask02 = _mm_setr_epi8(2, 5, 8, 11, 14, 0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13);
      const auto mask03 = _mm_setr_epi8(1, 4, 7, 10, 13, 2, 5, 8, 11, 14, 0, 3, 6, 9, 12, 15);
      static constexpr char _128 = static_cast<char>(0b1000'0000);
      const auto mask11 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128);
      const auto mask21 = _mm_setr_epi8(_128, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128, 0, 0, 0, 0, 0);
      const auto mask12 = _mm_setr_epi8(_128, _128, _128, _128, _128, 0, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128);
      const auto mask22 = _mm_setr_epi8(0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128);
      const auto mask13 = _mm_setr_epi8(_128, _128, _128, _128, _128, _128, _128, _128, _128, _128, 0, 0, 0, 0, 0, 0);
      const auto mask23 = _mm_setr_epi8(_128, _128, _128, _128, _128, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128);
      const auto x1 = _mm_shuffle_epi8(t1, mask01);
      const auto x2 = _mm_shuffle_epi8(t2, mask02);
      const auto x3 = _mm_shuffle_epi8(t3, mask03);
      const auto x4 = _mm_shuffle_epi8(t4, mask01);
      const auto x5 = _mm_shuffle_epi8(t5, mask02);
      const auto x6 = _mm_shuffle_epi8(t6, mask03);
      const auto r1 = _mm_blendv_epi8(_mm_alignr_epi8(x3, x3, 5), _mm_blendv_epi8(x1, _mm_alignr_epi8(x2, x2, 10), mask11), mask21);
      const auto g1 = _mm_blendv_epi8(_mm_alignr_epi8(x1, x1, 6), _mm_blendv_epi8(x2, _mm_alignr_epi8(x3, x3, 10), mask12), mask22);
      const auto b1 = _mm_blendv_epi8(_mm_alignr_epi8(x2, x2, 6), _mm_blendv_epi8(x3, _mm_alignr_epi8(x1, x1, 11), mask13), mask23);
      const auto r2 = _mm_blendv_epi8(_mm_alignr_epi8(x6, x6, 5), _mm_blendv_epi8(x4, _mm_alignr_epi8(x5, x5, 10), mask11), mask21);
      const auto g2 = _mm_blendv_epi8(_mm_alignr_epi8(x4, x4, 6), _mm_blendv_epi8(x5, _mm_alignr_epi8(x6, x6, 10), mask12), mask22);
      const auto b2 = _mm_blendv_epi8(_mm_alignr_epi8(x5, x5, 6), _mm_blendv_epi8(x6, _mm_alignr_epi8(x4, x4, 11), mask13), mask23);
      const auto r = _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
      const auto g = _mm256_inserti128_si256(_mm256_castsi128_si256(g1), g2, 1);
      const auto b = _mm256_inserti128_si256(_mm256_castsi128_si256(b1), b2, 1);
      return {{r, g, b}};
    }
  }
  template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
  static inline void encode_avx2(Pusher& p_, Puller& pixels_, const desc& desc){
    static constexpr bool Alpha = Channels == 4;
    std::uint8_t* p = p_.raw_pointer();
    const std::uint8_t* pixels = pixels_.raw_pointer();

    rgba_t index[index_size] = {};

    const auto zero = _mm256_setzero_si256();

    pixels_type<Alpha> prev;
    prev.val[0] = prev.val[1] = prev.val[2] = zero;
    if constexpr(Alpha)
      prev.val[3] = _mm256_set1_epi8(static_cast<char>(0xff));

    std::size_t run = 0;
    rgba_t px = {0, 0, 0, 255};
    auto prev_hash = static_cast<std::uint8_t>(index_size);

    std::size_t px_len = desc.width * desc.height;
    std::size_t simd_len = px_len / simd_lanes;
    const std::size_t simd_len_32 = simd_len * simd_lanes;
    px_len -= simd_len_32;
    pixels_.advance(simd_len_32*Channels);
    while(simd_len--){
      const auto pxs = load<Alpha>(pixels);
      pixels_type<Alpha> diff;
      diff.val[0] = _mm256_sub_epi8(pxs.val[0], prev_vector(pxs.val[0], prev.val[0]));
      diff.val[1] = _mm256_sub_epi8(pxs.val[1], prev_vector(pxs.val[1], prev.val[1]));
      diff.val[2] = _mm256_sub_epi8(pxs.val[2], prev_vector(pxs.val[2], prev.val[2]));
      bool alpha = true;
      if constexpr(Alpha){
        diff.val[3] = _mm256_sub_epi8(pxs.val[3], prev_vector(pxs.val[3], prev.val[3]));
        alpha = _mm256_testz_si256(diff.val[3], diff.val[3]);
        diff.val[3] = _mm256_cmpeq_epi8(diff.val[3], zero);
      }
      const auto ored = _mm256_or_si256(_mm256_or_si256(diff.val[0], diff.val[1]), diff.val[2]);
      auto runv = _mm256_cmpeq_epi8(ored, zero);
      if(_mm256_testz_si256(ored, ored) && alpha){
        run += simd_lanes;
        pixels += simd_lanes*Channels;
        continue;
      }
      if constexpr(Alpha)
        runv = _mm256_and_si256(runv, diff.val[3]);
      const auto r = lsb32(~_mm256_movemask_epi8(runv));
      run += r;
      pixels += r*Channels;
      if(run > 0){
        while(run >= 62)[[unlikely]]{
          static constexpr std::uint8_t x = chunk_tag::run | 61;
          *p++ = x;
          run -= 62;
        }
        if(run > 1){
          *p++ = static_cast<std::uint8_t>(chunk_tag::run | (run-1));
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
      }
      const auto two = _mm256_set1_epi8(2);
      diff.val[0] = _mm256_add_epi8(diff.val[0], two);
      diff.val[1] = _mm256_add_epi8(diff.val[1], two);
      diff.val[2] = _mm256_add_epi8(diff.val[2], two);
      const auto diffor = _mm256_or_si256(_mm256_or_si256(diff.val[0], diff.val[1]), diff.val[2]);
      const auto diffv = _mm256_and_si256(_mm256_or_si256(_mm256_or_si256(_mm256_set1_epi8(chunk_tag::diff), slli_epi8<4>(diff.val[0])), _mm256_or_si256(slli_epi8<2>(diff.val[1]), diff.val[2])), _mm256_cmpeq_epi8(_mm256_and_si256(diffor, _mm256_set1_epi8(0b11)), diffor));
      const auto eight = _mm256_set1_epi8(8);
      diff.val[0] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[0], diff.val[1]), eight);
      diff.val[2] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[2], diff.val[1]), eight);
      diff.val[1] = _mm256_add_epi8(diff.val[1], _mm256_set1_epi8(30));
      const auto luma_mask = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
      const auto lu = _mm256_permutevar8x32_epi32(_mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero)), luma_mask);
      const auto ma = _mm256_permutevar8x32_epi32(_mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]), luma_mask);
      __m256i hash;
      if constexpr(Alpha)
        hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), mul_epi8<11>(pxs.val[3]))), _mm256_set1_epi8(63));
      else
        hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), _mm256_set1_epi8(static_cast<std::uint8_t>(255*11)))), _mm256_set1_epi8(63));
      alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
      [[maybe_unused]] alignas(alignof(__m256i)) std::uint8_t alphas[simd_lanes];
      _mm256_store_si256(reinterpret_cast<__m256i*>(runs), runv);
      _mm256_store_si256(reinterpret_cast<__m256i*>(diffs), diffv);
      _mm256_store_si256(reinterpret_cast<__m256i*>(lumas), _mm256_unpacklo_epi8(lu, ma));
      _mm256_store_si256(reinterpret_cast<__m256i*>(lumas)+1, _mm256_unpackhi_epi8(lu, ma));
      _mm256_store_si256(reinterpret_cast<__m256i*>(hashs), hash);
      if constexpr(Alpha)
        if(!alpha)
          _mm256_store_si256(reinterpret_cast<__m256i*>(alphas), diff.val[3]);
      for(std::size_t i = r; i < simd_lanes; ++i){
        if(runs[i]){
          ++run;
          pixels += Channels;
          continue;
        }
        if(run > 1){
          *p++ = static_cast<std::uint8_t>(chunk_tag::run | (run-1));
          run = 0;
        }
        else if(run == 1){
          if(prev_hash == index_size)[[unlikely]]
            *p++ = chunk_tag::run;
          else
            *p++ = chunk_tag::index | prev_hash;
          run = 0;
        }
        const auto index_pos = hashs[i];
        prev_hash = index_pos;
        efficient_memcpy<Channels>(&px, pixels);
        pixels += Channels;
        if(index[index_pos] == px){
          *p++ = chunk_tag::index | index_pos;
          continue;
        }
        index[index_pos] = px;

        if constexpr(Alpha)
          if(!alpha && !alphas[i]){
            *p++ = chunk_tag::rgba;
            std::memcpy(p, &px, 4);
            p += 4;
            continue;
          }
        if(diffs[i])
          *p++ = diffs[i];
        else if(lumas[i*2]){
          std::memcpy(p, lumas + i*2, 2);
          p += 2;
        }
        else{
          *p++ = chunk_tag::rgb;
          efficient_memcpy<3>(p, &px);
          p += 3;
        }
      }
      prev = pxs;
    }
    p_.advance(p-p_.raw_pointer());

    if constexpr(Alpha)
      encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
    else{
      rgb_t px_prev;
      efficient_memcpy<3>(&px_prev, &px);
      encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
    }

    push<sizeof(padding)>(p_, padding);
  }
#endif
#endif

  template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
  static inline void encode_impl(Pusher& p, Puller& pixels, const desc& desc){
    rgba_t index[index_size] = {};

    std::size_t px_len = desc.width * desc.height;
    encode_body<Channels>(p, pixels, index, px_len);

    push<sizeof(padding)>(p, padding);
  }

  template<typename Puller>
  static inline desc decode_header(Puller& p){
    desc d;
    const auto magic_ = read_32(p);
    d.width = read_32(p);
    d.height = read_32(p);
    d.channels = p.pull();
    d.colorspace = static_cast<qoi::colorspace>(p.pull());
    if(
      d.width == 0 || d.height == 0 || magic_ != magic ||
      d.channels < 3 || d.channels > 4 ||
      d.height >= pixels_max / d.width
    )[[unlikely]]
      throw std::runtime_error("qoixx::qoi::decode: invalid header");
    return d;
  }

#ifndef QOIXX_DECODE_WITH_TABLES
#define QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
#ifdef __aarch64__
#define QOIXX_DECODE_WITH_TABLES 0
#else
#define QOIXX_DECODE_WITH_TABLES 1
#endif
#endif

#if QOIXX_DECODE_WITH_TABLES
  static constexpr std::size_t hash_table_offset = std::numeric_limits<std::uint8_t>::max()+1 - chunk_tag::diff;
  static constexpr std::array<int, std::numeric_limits<std::uint8_t>::max()+1+chunk_tag::run-chunk_tag::diff> create_hash_diff_table(){
    std::array<int, std::numeric_limits<std::uint8_t>::max()+1+chunk_tag::run-chunk_tag::diff> table = {};
    for(std::size_t i = 0; i <= std::numeric_limits<std::uint8_t>::max(); ++i){
      constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
      const auto vr = (i >> 4);
      const auto vb = (i & mask_tail_4);
      table[i] = (vr*3 + vb*7) % index_size;
    }
    for(std::size_t i = chunk_tag::diff; i < chunk_tag::luma; ++i){
      constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
      const auto vr = static_cast<int>((i >> 4) & mask_tail_2) - 2;
      const auto vg = static_cast<int>((i >> 2) & mask_tail_2) - 2;
      const auto vb = static_cast<int>( i       & mask_tail_2) - 2;
      table[i+hash_table_offset] = static_cast<std::uint8_t>((vr*3 + vg*5 + vb*7) % index_size);
    }
    for(std::size_t i = chunk_tag::luma; i < chunk_tag::run; ++i){
        constexpr int vgv = chunk_tag::luma+40;
        const int vg = i - vgv;
        table[i+hash_table_offset] = static_cast<std::uint8_t>((vg*3 + (vg+8)*5 + vg*7) % index_size);
    }
    return table;
  }
  static constexpr std::array<std::array<std::uint8_t, 2>, std::numeric_limits<std::uint8_t>::max()+1> create_luma_table(){
    std::array<std::array<std::uint8_t, 2>, std::numeric_limits<std::uint8_t>::max()+1> table = {};
    for(std::size_t i = 0; i <= std::numeric_limits<std::uint8_t>::max(); ++i){
      constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
      const auto vr = (i >> 4);
      const auto vb = (i & mask_tail_4);
      table[i][0] = static_cast<uint8_t>(vr);
      table[i][1] = static_cast<uint8_t>(vb);
    }
    return table;
  }
  static constexpr std::array<std::array<std::int8_t, 3>, chunk_tag::luma> create_diff_table(){
    std::array<std::array<std::int8_t, 3>, chunk_tag::luma> table = {};
    for(std::size_t i = chunk_tag::diff; i < chunk_tag::luma; ++i){
      constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
      const auto vr = ((i >> 4) & mask_tail_2) - 2;
      const auto vg = ((i >> 2) & mask_tail_2) - 2;
      const auto vb = ( i       & mask_tail_2) - 2;
      table[i][0] = static_cast<uint8_t>(vr);
      table[i][1] = static_cast<uint8_t>(vg);
      table[i][2] = static_cast<uint8_t>(vb);
    }
    return table;
  }
#endif

  template<std::size_t Channels, typename Pusher, typename Puller>
  static inline void decode_impl(Pusher& pixels, Puller& p, std::size_t px_len, std::size_t size){
#ifndef __aarch64__
    using rgba_t = std::conditional_t<Channels == 4, qoi::rgba_t, qoi::rgb_t>;
#endif
    rgba_t px = {};
    if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
      px.a = 255;
    rgba_t index[index_size];
    if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value){
      index[(0*3+0*5+0*7+0*11)%index_size] = {};
      index[(0*3+0*5+0*7+255*11)%index_size] = px;
    }
    else
      index[(0*3+0*5+0*7+255*11)%index_size] = {};

#if QOIXX_DECODE_WITH_TABLES
#define QOIXX_HPP_WITH_TABLES(...) __VA_ARGS__
#define QOIXX_HPP_WITHOUT_TABLES(...)
#else
#define QOIXX_HPP_WITH_TABLES(...)
#define QOIXX_HPP_WITHOUT_TABLES(...) __VA_ARGS__
#endif

    QOIXX_HPP_WITH_TABLES(
    auto hash = px.hash() % index_size;
    static constexpr auto luma_hash_diff_table = create_hash_diff_table();
    static constexpr auto hash_diff_table = luma_hash_diff_table.data() + hash_table_offset;
    )

    const auto f = [&pixels, &p, &px_len, &size, &px, &index QOIXX_HPP_WITH_TABLES(, &hash)]{
      const auto b1 = p.pull();
      --size;

#if defined(__aarch64__) and not defined(QOIXX_NO_SIMD)
#define QOIXX_HPP_DECODE_RUN(px, run) { \
    if constexpr(Pusher::is_contiguous){ \
      ++run; \
      if(run >= 8){ \
        std::conditional_t<Channels == 4, uint8x8x4_t, uint8x8x3_t> data = {vdup_n_u8(px.r), vdup_n_u8(px.g), vdup_n_u8(px.b)}; \
        if constexpr(Channels == 4) \
          data.val[3] = vdup_n_u8(px.a); \
        while(run>=8){ \
          if constexpr(Channels == 4) \
            vst4_u8(pixels.raw_pointer(), data); \
          else \
            vst3_u8(pixels.raw_pointer(), data); \
          pixels.advance(Channels*8); \
          run -= 8; \
        } \
      } \
      while(run--){push<Channels>(pixels, &px);} \
    } \
    else \
      do{push<Channels>(pixels, &px);}while(run--); \
  }
#else
#define QOIXX_HPP_DECODE_RUN(px, run) do{push<Channels>(pixels, &px);}while(run--);
#endif

      if(b1 >= chunk_tag::run){
        if(b1 < chunk_tag::rgb){
          /*run*/
          static constexpr std::uint32_t mask_tail_6 = 0b0011'1111u;
          std::size_t run = b1 & mask_tail_6;
          if(run >= px_len)[[unlikely]]
            run = px_len;
          px_len -= run;
          QOIXX_HPP_DECODE_RUN(px, run)
          return;
        }
        if(b1 == chunk_tag::rgb){
          pull<3>(&px, p);
          size -= 3;
          QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
        }
        if constexpr(Channels == 4){
          if(b1 == chunk_tag::rgba){
            pull<4>(&px, p);
            size -= 4;
            QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
          }
        }
        else{
          if(b1 == chunk_tag::rgba)[[unlikely]]{
            pull<3>(&px, p);
            p.advance(1);
            size -= 4;
            QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
          }
        }
      }
      else if(b1 < chunk_tag::diff){
        /*index*/
        if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
          px = index[b1];
        else
          efficient_memcpy<Channels>(&px, index + b1);
        push<Channels>(pixels, &px);
        QOIXX_HPP_WITH_TABLES(hash = b1;)
        return;
      }
      else if(b1 >= chunk_tag::luma){
        /*luma*/
        const auto b2 = p.pull();
        --size;
        QOIXX_HPP_WITH_TABLES(
        static constexpr auto table = create_luma_table();
        const auto drb = table[b2];
        )
        static constexpr int vgv = chunk_tag::luma+40;
        const int vg = b1 - vgv;
        QOIXX_HPP_WITH_TABLES(
        px.r += vg + drb[0];
        px.g += vg + 8;
        px.b += vg + drb[1];
        hash = (static_cast<int>(hash)+hash_diff_table[b1]+luma_hash_diff_table[b2]) % index_size;
        ) QOIXX_HPP_WITHOUT_TABLES(
        static constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
        px.r += vg + (b2 >> 4);
        px.g += vg + 8;
        px.b += vg + (b2 & mask_tail_4);
        )
      }
      else{
        /*diff*/
        QOIXX_HPP_WITH_TABLES(
        static constexpr auto table = create_diff_table();
        const auto drgb = table[b1];
        px.r += drgb[0];
        px.g += drgb[1];
        px.b += drgb[2];
        hash = (static_cast<int>(hash)+hash_diff_table[b1]) % index_size;
        ) QOIXX_HPP_WITHOUT_TABLES(
        static constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
        px.r += ((b1 >> 4) & mask_tail_2) - 2;
        px.g += ((b1 >> 2) & mask_tail_2) - 2;
        px.b += ( b1       & mask_tail_2) - 2;
        )
      }
#undef QOIXX_HPP_DECODE_RUN
      if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
        index[QOIXX_HPP_WITH_TABLES(hash) QOIXX_HPP_WITHOUT_TABLES(px.hash() % index_size)] = px;
      else
        efficient_memcpy<Channels>(index + QOIXX_HPP_WITH_TABLES(hash) QOIXX_HPP_WITHOUT_TABLES(px.hash() % index_size), &px);
#undef QOIXX_HPP_WITHOUT_TABLES
#undef QOIXX_HPP_WITH_TABLES
#ifdef QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
#undef QOIXX_DECODE_WITH_TABLES
#undef QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
#endif

      push<Channels>(pixels, &px);
    };

    while(px_len--)[[likely]]{
      f();
      if(size < sizeof(padding))[[unlikely]]{
        throw std::runtime_error("qoixx::qoi::decode: insufficient input data");
      }
    }
  }
 public:
  template<typename T, typename U>
  static inline T encode(const U& u, const desc& desc){
    using coU = container_operator<U>;
    if(!coU::valid(u) || coU::size(u) < desc.width*desc.height*desc.channels || desc.width == 0 || desc.height == 0 || desc.channels < 3 || desc.channels > 4 || desc.height >= pixels_max / desc.width)[[unlikely]]
      throw std::invalid_argument{"qoixx::qoi::encode: invalid argument"};

    const auto max_size = static_cast<std::size_t>(desc.width) * desc.height * (desc.channels + 1) + header_size + sizeof(padding);
    using coT = container_operator<T>;
    T data = coT::construct(max_size);
    auto p = coT::create_pusher(data);
    auto puller = coU::create_puller(u);

    write_32(p, magic);
    write_32(p, desc.width);
    write_32(p, desc.height);
    p.push(desc.channels);
    p.push(static_cast<std::uint8_t>(desc.colorspace));

#ifndef QOIXX_NO_SIMD
#if defined(__ARM_FEATURE_SVE)
    if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
      if(desc.channels == 4)
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH \
        switch(svcntb()){ \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(128); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(256); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(384); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(512); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(640); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(768); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(896); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1024); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1152); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1280); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1408); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1536); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1664); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1792); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1920); \
          QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(2048); \
          default: while(true){/*unreachable*/} \
        }
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(i) case i/8: encode_sve<i, 4>(p, puller, desc); break
        QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE
      else
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(i) case i/8: encode_sve<i, 3>(p, puller, desc); break;
        QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
    else
#elif defined(__aarch64__)
    if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
      if(desc.channels == 4)
        encode_neon<4>(p, puller, desc);
      else
        encode_neon<3>(p, puller, desc);
    else
#elif defined(__AVX2__)
    if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
      if(desc.channels == 4)
        encode_avx2<4>(p, puller, desc);
      else
        encode_avx2<3>(p, puller, desc);
    else
#endif
#endif
      if(desc.channels == 4)
        encode_impl<4>(p, puller, desc);
      else
        encode_impl<3>(p, puller, desc);

    return p.finalize();
  }
  template<typename T, typename U>
  requires(sizeof(U) == 1)
  static inline T encode(const U* pixels, std::size_t size, const desc& desc){
    return encode<T>(std::make_pair(pixels, size), desc);
  }
  template<typename T, typename U>
  requires (!std::is_pointer_v<U>)
  static inline std::pair<T, desc> decode(const U& u, std::uint8_t channels = 0){
    using coU = container_operator<U>;
    const auto size = coU::size(u);
    if(!coU::valid(u) || size < header_size + sizeof(padding) || (channels != 0 && channels != 3 && channels != 4))[[unlikely]]
      throw std::invalid_argument{"qoixx::qoi::decode: invalid argument"};
    auto puller = coU::create_puller(u);

    const auto d = decode_header(puller);
    if(channels == 0)
      channels = d.channels;

    const std::size_t px_len = static_cast<std::size_t>(d.width) * d.height;
    using coT = container_operator<T>;
    T data = coT::construct(px_len*channels);
    auto p = coT::create_pusher(data);

    if(channels == 4)
      decode_impl<4>(p, puller, px_len, size);
    else
      decode_impl<3>(p, puller, px_len, size);

    return std::make_pair(std::move(p.finalize()), d);
  }
  template<typename T, typename U>
  requires(sizeof(U) == 1)
  static inline std::pair<T, desc> decode(const U* pixels, std::size_t size, std::uint8_t channels = 0){
    return decode<T>(std::make_pair(pixels, size), channels);
  }
};

}

#endif //QOIXX_HPP_INCLUDED_