// Mirror of https://github.com/jameshball/osci-render (1424 lines, 54 KiB, C++)
/*
|
|
MIT License
|
|
|
|
Copyright (c) 2022 I
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in all
|
|
copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE.
|
|
*/
|
|
#ifndef QOIXX_HPP_INCLUDED_
|
|
#define QOIXX_HPP_INCLUDED_
|
|
|
|
#include<algorithm>
#include<array>
#include<bit>
#include<concepts>
#include<cstddef>
#include<cstdint>
#include<cstring>
#include<memory>
#include<numeric>
#include<stdexcept>
#include<type_traits>
#include<utility>
#include<vector>
|
|
|
|
#ifndef QOIXX_NO_SIMD
|
|
#if defined(__ARM_FEATURE_SVE)
|
|
#include<arm_sve.h>
|
|
#include<arm_neon.h>
|
|
#elif defined(__aarch64__)
|
|
#include<arm_neon.h>
|
|
#elif defined(__AVX2__)
|
|
#include<immintrin.h>
|
|
#endif
|
|
#endif
|
|
|
|
namespace qoixx{
|
|
|
|
namespace detail{
|
|
|
|
template<typename T>
|
|
requires(sizeof(T) == 1 && !std::same_as<T, bool>)
|
|
struct contiguous_puller{
|
|
static constexpr bool is_contiguous = true;
|
|
const T* t;
|
|
inline std::uint8_t pull()noexcept{
|
|
return static_cast<std::uint8_t>(*t++);
|
|
}
|
|
inline const std::uint8_t* raw_pointer()noexcept{
|
|
return reinterpret_cast<const std::uint8_t*>(t);
|
|
}
|
|
inline void advance(std::size_t n)noexcept{
|
|
t += n;
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
struct default_container_operator;
|
|
|
|
template<typename T, typename A>
|
|
requires(sizeof(T) == 1)
|
|
struct default_container_operator<std::vector<T, A>>{
|
|
using target_type = std::vector<T, A>;
|
|
static inline target_type construct(std::size_t size){
|
|
target_type t(size);
|
|
return t;
|
|
}
|
|
struct pusher{
|
|
static constexpr bool is_contiguous = true;
|
|
target_type* t;
|
|
std::size_t i = 0;
|
|
inline void push(std::uint8_t x)noexcept{
|
|
(*t)[i++] = static_cast<T>(x);
|
|
}
|
|
template<typename U>
|
|
requires std::unsigned_integral<U> && (sizeof(U) != 1)
|
|
inline void push(U t)noexcept{
|
|
this->push(static_cast<std::uint8_t>(t));
|
|
}
|
|
inline target_type finalize()noexcept{
|
|
t->resize(i);
|
|
return std::move(*t);
|
|
}
|
|
inline std::uint8_t* raw_pointer()noexcept{
|
|
return reinterpret_cast<std::uint8_t*>(t->data())+i;
|
|
}
|
|
inline void advance(std::size_t n)noexcept{
|
|
i += n;
|
|
}
|
|
};
|
|
static constexpr pusher create_pusher(target_type& t)noexcept{
|
|
return {&t};
|
|
}
|
|
using puller = contiguous_puller<T>;
|
|
static constexpr puller create_puller(const target_type& t)noexcept{
|
|
return {t.data()};
|
|
}
|
|
static inline std::size_t size(const target_type& t)noexcept{
|
|
return t.size();
|
|
}
|
|
static constexpr bool valid(const target_type& t)noexcept{
|
|
return t.capacity() != 0;
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
requires(sizeof(T) == 1)
|
|
struct default_container_operator<std::pair<std::unique_ptr<T[]>, std::size_t>>{
|
|
using target_type = std::pair<std::unique_ptr<T[]>, std::size_t>;
|
|
static inline target_type construct(std::size_t size){
|
|
return {typename target_type::first_type{static_cast<T*>(::operator new[](size))}, 0};
|
|
}
|
|
struct pusher{
|
|
static constexpr bool is_contiguous = true;
|
|
target_type* t;
|
|
inline void push(std::uint8_t x)noexcept{
|
|
t->first[t->second++] = static_cast<T>(x);
|
|
}
|
|
template<typename U>
|
|
requires std::unsigned_integral<U> && (sizeof(U) != 1)
|
|
inline void push(U t)noexcept{
|
|
this->push(static_cast<std::uint8_t>(t));
|
|
}
|
|
inline target_type finalize()noexcept{
|
|
return std::move(*t);
|
|
}
|
|
inline std::uint8_t* raw_pointer()noexcept{
|
|
return reinterpret_cast<std::uint8_t*>(t->first.get())+t->second;
|
|
}
|
|
inline void advance(std::size_t n)noexcept{
|
|
t->second += n;
|
|
}
|
|
};
|
|
static constexpr pusher create_pusher(target_type& t)noexcept{
|
|
return {&t};
|
|
}
|
|
using puller = contiguous_puller<T>;
|
|
static constexpr puller create_puller(const target_type& t)noexcept{
|
|
return {t.first.get()};
|
|
}
|
|
static inline std::size_t size(const target_type& t)noexcept{
|
|
return t.second;
|
|
}
|
|
static constexpr bool valid(const target_type& t)noexcept{
|
|
return t.first != nullptr;
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
requires(sizeof(T) == 1)
|
|
struct default_container_operator<std::pair<T*, std::size_t>>{
|
|
using target_type = std::pair<T*, std::size_t>;
|
|
using puller = contiguous_puller<T>;
|
|
static constexpr puller create_puller(const target_type& t)noexcept{
|
|
return {t.first};
|
|
}
|
|
static inline std::size_t size(const target_type& t)noexcept{
|
|
return t.second;
|
|
}
|
|
static inline bool valid(const target_type& t)noexcept{
|
|
return t.first != nullptr;
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
template<typename T>
|
|
struct container_operator : detail::default_container_operator<T>{};
|
|
|
|
class qoi{
|
|
/// Copies exactly `Size` bytes from `src` to `dst`.
///
/// A 3-byte copy is issued as a 2-byte copy followed by a 1-byte copy; every
/// other size is a single `std::memcpy`.
template<std::size_t Size>
static inline void efficient_memcpy(void* dst, const void* src){
  if constexpr(Size != 3)
    std::memcpy(dst, src, Size);
  else{
    auto*const out = static_cast<std::byte*>(dst);
    const auto*const in = static_cast<const std::byte*>(src);
    std::memcpy(out, in, 2);
    std::memcpy(out + 2, in + 2, 1);
  }
}
|
|
template<std::size_t Size, typename T>
|
|
static inline void push(T& dst, const void* src){
|
|
if constexpr(T::is_contiguous){
|
|
auto*const ptr = dst.raw_pointer();
|
|
dst.advance(Size);
|
|
efficient_memcpy<Size>(ptr, src);
|
|
}
|
|
else{
|
|
const auto* ptr = static_cast<const std::uint8_t*>(src);
|
|
auto size = Size;
|
|
while(size --> 0)
|
|
dst.push(*ptr++);
|
|
}
|
|
}
|
|
template<std::size_t Size, typename T>
|
|
static inline void pull(void* dst, T& src){
|
|
if constexpr(T::is_contiguous){
|
|
const auto*const ptr = src.raw_pointer();
|
|
src.advance(Size);
|
|
efficient_memcpy<Size>(dst, ptr);
|
|
}
|
|
else{
|
|
auto* ptr = static_cast<std::uint8_t*>(dst);
|
|
auto size = Size;
|
|
while(size --> 0)
|
|
*ptr++ = src.pull();
|
|
}
|
|
}
|
|
/// Tag values written as (part of) the first byte of each encoded chunk.
/// `index`, `diff`, `luma` and `run` occupy the top two bits and are OR-ed
/// with a payload in the low six bits; `rgb` and `rgba` are full-byte tags.
enum chunk_tag : std::uint32_t{
  index = 0x00u,
  diff  = 0x40u,
  luma  = 0x80u,
  run   = 0xc0u,
  rgb   = 0xfeu,
  rgba  = 0xffu,
};
/// Number of slots in the colour lookup table used by the encoder.
static constexpr std::size_t index_size = 64u;
|
|
public:
|
|
enum class colorspace : std::uint8_t{
|
|
srgb = 0,
|
|
linear = 1,
|
|
};
|
|
struct desc{
|
|
std::uint32_t width;
|
|
std::uint32_t height;
|
|
std::uint8_t channels;
|
|
qoi::colorspace colorspace;
|
|
constexpr bool operator==(const desc&)const noexcept = default;
|
|
};
|
|
struct rgba_t{
|
|
std::uint8_t r, g, b, a;
|
|
inline std::uint32_t v()const{
|
|
static_assert(sizeof(rgba_t) == sizeof(std::uint32_t));
|
|
if constexpr(std::endian::native == std::endian::little){
|
|
std::uint32_t x;
|
|
std::memcpy(&x, this, sizeof(std::uint32_t));
|
|
return x;
|
|
}
|
|
else
|
|
return std::uint32_t{r} |
|
|
std::uint32_t{g} << 8 |
|
|
std::uint32_t{b} << 16 |
|
|
std::uint32_t{a} << 24;
|
|
}
|
|
inline std::uint_fast32_t hash()const{
|
|
static constexpr std::uint64_t constant =
|
|
static_cast<std::uint64_t>(3u) << 56 |
|
|
5u << 16 |
|
|
static_cast<std::uint64_t>(7u) << 40 |
|
|
11u;
|
|
const auto v = static_cast<std::uint64_t>(this->v());
|
|
return (((v<<32|v)&0xFF00FF0000FF00FF)*constant)>>56;
|
|
}
|
|
inline bool operator==(const rgba_t& rhs)const{
|
|
return v() == rhs.v();
|
|
}
|
|
inline bool operator!=(const rgba_t& rhs)const{
|
|
return v() != rhs.v();
|
|
}
|
|
};
|
|
struct rgb_t{
|
|
std::uint8_t r, g, b;
|
|
inline std::uint32_t v()const{
|
|
static_assert(sizeof(rgb_t) == 3u);
|
|
if constexpr(std::endian::native == std::endian::little){
|
|
std::uint32_t x = 255u << 24u;
|
|
efficient_memcpy<3>(&x, this);
|
|
return x;
|
|
}
|
|
else
|
|
return std::uint32_t{r} |
|
|
std::uint32_t{g} << 8 |
|
|
std::uint32_t{b} << 16 |
|
|
255u << 24;
|
|
}
|
|
inline std::uint_fast32_t hash()const{
|
|
static constexpr std::uint64_t constant =
|
|
static_cast<std::uint64_t>(3u) << 56 |
|
|
5u << 16 |
|
|
static_cast<std::uint64_t>(7u) << 40 |
|
|
11u;
|
|
const auto v =
|
|
static_cast<std::uint64_t>(r) |
|
|
static_cast<std::uint64_t>(g) << 40 |
|
|
static_cast<std::uint64_t>(b) << 16 |
|
|
static_cast<std::uint64_t>(0xff) << 56 ;
|
|
return (v*constant)>>56;
|
|
}
|
|
inline bool operator==(const rgb_t& rhs)const{
|
|
return ((this->r^rhs.r)|(this->g^rhs.g)|(this->b^rhs.b)) == 0;
|
|
}
|
|
};
|
|
static constexpr std::uint32_t magic =
|
|
113u /*q*/ << 24 | 111u /*o*/ << 16 | 105u /*i*/ << 8 | 102u /*f*/ ;
|
|
static constexpr std::size_t header_size =
|
|
sizeof(magic) +
|
|
sizeof(std::declval<desc>().width) +
|
|
sizeof(std::declval<desc>().height) +
|
|
sizeof(std::declval<desc>().channels) +
|
|
sizeof(std::declval<desc>().colorspace);
|
|
static constexpr std::size_t pixels_max = 400000000u;
|
|
static constexpr std::uint8_t padding[8] = {0, 0, 0, 0, 0, 0, 0, 1};
|
|
template<typename Puller>
|
|
static inline std::uint32_t read_32(Puller& p){
|
|
if constexpr(std::endian::native == std::endian::big && Puller::is_contiguous){
|
|
std::uint32_t x;
|
|
pull<sizeof(x)>(&x, p);
|
|
return x;
|
|
}
|
|
else{
|
|
const auto _1 = p.pull();
|
|
const auto _2 = p.pull();
|
|
const auto _3 = p.pull();
|
|
const auto _4 = p.pull();
|
|
return static_cast<std::uint32_t>(_1 << 24 | _2 << 16 | _3 << 8 | _4);
|
|
}
|
|
}
|
|
template<typename Pusher>
|
|
static inline void write_32(Pusher& p, std::uint32_t value){
|
|
if constexpr(std::endian::native == std::endian::big && Pusher::is_contiguous)
|
|
push<sizeof(value)>(p, value);
|
|
else{
|
|
p.push((value & 0xff000000) >> 24);
|
|
p.push((value & 0x00ff0000) >> 16);
|
|
p.push((value & 0x0000ff00) >> 8);
|
|
p.push( value & 0x000000ff );
|
|
}
|
|
}
|
|
private:
|
|
template<bool Alpha>
|
|
using local_rgba_pixel_t = std::conditional_t<Alpha, rgba_t, rgb_t>;
|
|
template<bool Alpha>
|
|
static constexpr local_rgba_pixel_t<Alpha> default_pixel()noexcept{
|
|
if constexpr(Alpha)
|
|
return {0, 0, 0, 255};
|
|
else
|
|
return {};
|
|
}
|
|
template<bool Alpha>
|
|
struct local_pixel{
|
|
std::uint8_t rgb = static_cast<std::uint8_t>(chunk_tag::rgb);
|
|
local_rgba_pixel_t<Alpha> v;
|
|
};
|
|
static_assert(std::has_unique_object_representations_v<local_pixel<true>> and std::has_unique_object_representations_v<local_pixel<false>>);
|
|
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
|
|
static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, local_rgba_pixel_t<Channels == 4u> px_prev = default_pixel<Channels == 4u>(), std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
|
|
local_pixel<Channels == 4u> px;
|
|
while(px_len--)[[likely]]{
|
|
pull<Channels>(&px.v, pixels);
|
|
if(px.v.v() == px_prev.v()){
|
|
++run;
|
|
continue;
|
|
}
|
|
if(run > 0){
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
p.push(x);
|
|
run -= 62;
|
|
}
|
|
if(run > 1){
|
|
p.push(chunk_tag::run | (run-1));
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
p.push(chunk_tag::run);
|
|
else
|
|
p.push(chunk_tag::index | prev_hash);
|
|
run = 0;
|
|
}
|
|
}
|
|
|
|
const auto index_pos = px.v.hash() % index_size;
|
|
prev_hash = index_pos;
|
|
|
|
do{
|
|
if(index[index_pos].v() == px.v.v()){
|
|
p.push(chunk_tag::index | index_pos);
|
|
break;
|
|
}
|
|
efficient_memcpy<Channels>(index + index_pos, &px.v);
|
|
if constexpr(Channels == 3)
|
|
index[index_pos].a = 255u;
|
|
|
|
if constexpr(Channels == 4)
|
|
if(px.v.a != px_prev.a){
|
|
p.push(chunk_tag::rgba);
|
|
push<4>(p, &px.v);
|
|
break;
|
|
}
|
|
const auto vg_2 = static_cast<int>(px.v.g) - static_cast<int>(px_prev.g);
|
|
if(const std::uint8_t g = vg_2+32; g < 64){
|
|
const auto vr = static_cast<int>(px.v.r) - static_cast<int>(px_prev.r) + 2;
|
|
const auto vg = vg_2 + 2;
|
|
const auto vb = static_cast<int>(px.v.b) - static_cast<int>(px_prev.b) + 2;
|
|
|
|
if(static_cast<std::uint8_t>(vr|vg|vb) < 4){
|
|
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
|
|
break;
|
|
}
|
|
const auto vg_r = vr - vg + 8;
|
|
const auto vg_b = vb - vg + 8;
|
|
if(static_cast<std::uint8_t>(vg_r|vg_b) < 16){
|
|
p.push(chunk_tag::luma | g);
|
|
p.push(vg_r << 4 | vg_b);
|
|
}
|
|
else
|
|
push<4>(p, &px);
|
|
}
|
|
else
|
|
push<4>(p, &px);
|
|
}while(false);
|
|
efficient_memcpy<Channels>(&px_prev, &px.v);
|
|
}
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
p.push(x);
|
|
run -= 62;
|
|
}
|
|
if(run > 0)
|
|
p.push(chunk_tag::run | (run-1));
|
|
}
|
|
#ifndef QOIXX_NO_SIMD
|
|
#if defined(__ARM_FEATURE_SVE)
|
|
template<bool Alpha>
|
|
using pixels_type = std::conditional_t<Alpha, svuint8x4_t, svuint8x3_t>;
|
|
template<typename... Args>
|
|
requires (std::same_as<std::decay_t<Args>, svuint8_t> && ...)
|
|
static inline pixels_type<sizeof...(Args) == 4> create(Args&&... args)noexcept{
|
|
if constexpr(sizeof...(Args) == 4)
|
|
return svcreate4_u8(std::forward<Args>(args)...);
|
|
else
|
|
return svcreate3_u8(std::forward<Args>(args)...);
|
|
}
|
|
template<std::size_t ImmIndex>
|
|
static inline svuint8_t get(svuint8x4_t t)noexcept{
|
|
return svget4_u8(t, ImmIndex);
|
|
}
|
|
template<std::size_t ImmIndex>
|
|
static inline svuint8_t get(svuint8x3_t t)noexcept{
|
|
return svget3_u8(t, ImmIndex);
|
|
}
|
|
template<bool Alpha>
|
|
static inline pixels_type<Alpha> load(svbool_t pg, const std::uint8_t* ptr)noexcept{
|
|
if constexpr(Alpha)
|
|
return svld4_u8(pg, ptr);
|
|
else
|
|
return svld3_u8(pg, ptr);
|
|
}
|
|
template<std::size_t SVERegisterSize, std::uint_fast8_t Channels, typename Pusher, typename Puller>
|
|
static inline void encode_sve(Pusher& p_, Puller& pixels_, const desc& desc){
|
|
static constexpr bool Alpha = Channels == 4;
|
|
std::uint8_t* p = p_.raw_pointer();
|
|
const std::uint8_t* pixels = pixels_.raw_pointer();
|
|
|
|
rgba_t index[index_size] = {};
|
|
|
|
const auto zero = svdup_n_u8(0);
|
|
const auto iota = svindex_u8(0, 1);
|
|
|
|
pixels_type<Alpha> prev;
|
|
if constexpr(Alpha)
|
|
prev = create(zero, zero, zero, svdup_n_u8(255));
|
|
else
|
|
prev = create(zero, zero, zero);
|
|
|
|
std::size_t run = 0;
|
|
rgba_t px = {0, 0, 0, 255};
|
|
auto prev_hash = static_cast<std::uint8_t>(index_size);
|
|
|
|
const std::size_t px_len = desc.width * desc.height;
|
|
static constexpr auto vector_lanes = SVERegisterSize/8;
|
|
for(std::size_t i = 0; i < px_len; i += vector_lanes){
|
|
const auto mask = svwhilelt_b8_u64(i, px_len);
|
|
const auto num = std::min(px_len-i, vector_lanes);
|
|
const auto pxs = load<Alpha>(mask, pixels);
|
|
static constexpr std::uint64_t imm = SVERegisterSize/8-1;
|
|
auto rv = svsub_u8_x(mask, get<0>(pxs), svext_u8(get<0>(prev), get<0>(pxs), imm));
|
|
auto gv = svsub_u8_x(mask, get<1>(pxs), svext_u8(get<1>(prev), get<1>(pxs), imm));
|
|
auto bv = svsub_u8_x(mask, get<2>(pxs), svext_u8(get<2>(prev), get<2>(pxs), imm));
|
|
[[maybe_unused]] svbool_t av;
|
|
bool alpha = true;
|
|
if constexpr(Alpha){
|
|
av = svcmpeq_n_u8(mask, svsub_u8_x(mask, get<3>(pxs), svext_u8(get<3>(prev), get<3>(pxs), imm)), 0);
|
|
alpha = !svptest_any(mask, svnot_b_z(mask, av));
|
|
}
|
|
auto runv = svcmpeq_n_u8(mask, svorr_u8_x(mask, svorr_u8_x(mask, rv, gv), bv), 0);
|
|
if constexpr(Alpha)
|
|
runv = svand_b_z(mask, runv, av);
|
|
const auto not_runv = svnot_b_z(mask, runv);
|
|
if(!svptest_any(mask, not_runv)){
|
|
run += num;
|
|
pixels += num*Channels;
|
|
continue;
|
|
}
|
|
const auto r = svminv_u8(not_runv, iota);
|
|
run += r;
|
|
pixels += r*Channels;
|
|
if(run > 0){
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
*p++ = x;
|
|
run -= 62;
|
|
}
|
|
if(run > 1){
|
|
*p++ = chunk_tag::run | (run-1);
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
}
|
|
rv = svadd_n_u8_x(mask, rv, 2);
|
|
gv = svadd_n_u8_x(mask, gv, 2);
|
|
bv = svadd_n_u8_x(mask, bv, 2);
|
|
const auto diffv = svorr_u8_z(svcmplt_n_u8(mask, svorr_u8_z(mask, svorr_u8_x(mask, rv, gv), bv), 4), svorr_n_u8_x(mask, svlsl_n_u8_x(mask, rv, 4), chunk_tag::diff), svorr_u8_x(mask, svlsl_n_u8_x(mask, gv, 2), bv));
|
|
rv = svadd_n_u8_x(mask, svsub_u8_x(mask, rv, gv), 8);
|
|
bv = svadd_n_u8_x(mask, svsub_u8_x(mask, bv, gv), 8);
|
|
gv = svadd_n_u8_x(mask, gv, 30);
|
|
const auto lu = svorr_n_u8_z(svcmpeq_n_u8(mask, svorr_u8_x(mask, svand_n_u8_x(mask, svorr_u8_x(mask, rv, bv), 0xf0), svand_n_u8_x(mask, gv, 0xc0)), 0), gv, chunk_tag::luma);
|
|
const auto ma = svorr_u8_x(mask, svlsl_n_u8_x(mask, rv, 4), bv);
|
|
svuint8_t hash;
|
|
if constexpr(Alpha)
|
|
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), svmul_n_u8_x(mask, get<3>(pxs), 11))), 63);
|
|
else
|
|
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_n_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), static_cast<std::uint8_t>(255*11))), 63);
|
|
std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lumas[SVERegisterSize/8*2], hashs[SVERegisterSize/8];
|
|
[[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8];
|
|
svst1_u8(mask, runs, svadd_n_u8_m(runv, zero, 1));
|
|
svst1_u8(mask, diffs, diffv);
|
|
const auto luma = svcreate2_u8(lu, ma);
|
|
svst2_u8(mask, lumas, luma);
|
|
svst1_u8(mask, hashs, hash);
|
|
if constexpr(Alpha)
|
|
if(!alpha)
|
|
svst1_u8(mask, alphas, svadd_n_u8_m(av, zero, 1));
|
|
for(std::size_t i = r; i < num; ++i){
|
|
if(runs[i]){
|
|
++run;
|
|
pixels += Channels;
|
|
continue;
|
|
}
|
|
if(run > 1){
|
|
*p++ = chunk_tag::run | (run-1);
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
const auto index_pos = hashs[i];
|
|
prev_hash = index_pos;
|
|
efficient_memcpy<Channels>(&px, pixels);
|
|
pixels += Channels;
|
|
if(index[index_pos] == px){
|
|
*p++ = chunk_tag::index | index_pos;
|
|
continue;
|
|
}
|
|
index[index_pos] = px;
|
|
|
|
if constexpr(Alpha)
|
|
if(!alpha && !alphas[i]){
|
|
*p++ = chunk_tag::rgba;
|
|
std::memcpy(p, &px, 4);
|
|
p += 4;
|
|
continue;
|
|
}
|
|
if(diffs[i])
|
|
*p++ = diffs[i];
|
|
else if(lumas[i*2]){
|
|
std::memcpy(p, lumas + i*2, 2);
|
|
p += 2;
|
|
}
|
|
else{
|
|
*p++ = chunk_tag::rgb;
|
|
efficient_memcpy<3>(p, &px);
|
|
p += 3;
|
|
}
|
|
}
|
|
prev = pxs;
|
|
}
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
*p++ = x;
|
|
run -= 62;
|
|
}
|
|
if(run > 0){
|
|
*p++ = chunk_tag::run | (run-1);
|
|
run = 0;
|
|
}
|
|
p_.advance(p-p_.raw_pointer());
|
|
pixels_.advance(px_len*Channels);
|
|
|
|
push<sizeof(padding)>(p_, padding);
|
|
}
|
|
#elif defined(__aarch64__)
|
|
template<bool Alpha>
|
|
using pixels_type = std::conditional_t<Alpha, uint8x16x4_t, uint8x16x3_t>;
|
|
template<bool Alpha>
|
|
static inline pixels_type<Alpha> load(const std::uint8_t* ptr)noexcept{
|
|
if constexpr(Alpha)
|
|
return vld4q_u8(ptr);
|
|
else
|
|
return vld3q_u8(ptr);
|
|
}
|
|
static constexpr std::size_t simd_lanes = 16;
|
|
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
|
|
static inline void encode_neon(Pusher& p_, Puller& pixels_, const desc& desc){
|
|
static constexpr bool Alpha = Channels == 4;
|
|
std::uint8_t* p = p_.raw_pointer();
|
|
const std::uint8_t* pixels = pixels_.raw_pointer();
|
|
|
|
rgba_t index[index_size] = {};
|
|
|
|
const auto zero = vdupq_n_u8(0);
|
|
static constexpr std::uint8_t iota_[simd_lanes] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
const auto iota = vld1q_u8(iota_);
|
|
|
|
pixels_type<Alpha> prev;
|
|
prev.val[0] = prev.val[1] = prev.val[2] = zero;
|
|
if constexpr(Alpha)
|
|
prev.val[3] = vdupq_n_u8(255);
|
|
|
|
std::size_t run = 0;
|
|
rgba_t px = {0, 0, 0, 255};
|
|
auto prev_hash = static_cast<std::uint8_t>(index_size);
|
|
|
|
std::size_t px_len = desc.width * desc.height;
|
|
std::size_t simd_len = px_len / simd_lanes;
|
|
const std::size_t simd_len_16 = simd_len * simd_lanes;
|
|
px_len -= simd_len_16;
|
|
pixels_.advance(simd_len_16*Channels);
|
|
while(simd_len--){
|
|
const auto pxs = load<Alpha>(pixels);
|
|
pixels_type<Alpha> diff;
|
|
diff.val[0] = vsubq_u8(pxs.val[0], vextq_u8(prev.val[0], pxs.val[0], simd_lanes-1));
|
|
diff.val[1] = vsubq_u8(pxs.val[1], vextq_u8(prev.val[1], pxs.val[1], simd_lanes-1));
|
|
diff.val[2] = vsubq_u8(pxs.val[2], vextq_u8(prev.val[2], pxs.val[2], simd_lanes-1));
|
|
bool alpha = true;
|
|
if constexpr(Alpha){
|
|
diff.val[3] = vsubq_u8(pxs.val[3], vextq_u8(prev.val[3], pxs.val[3], simd_lanes-1));
|
|
diff.val[3] = vceqq_u8(diff.val[3], zero);
|
|
alpha = vminvq_u8(diff.val[3]) != 0;
|
|
}
|
|
auto runv = vceqq_u8(vorrq_u8(vorrq_u8(diff.val[0], diff.val[1]), diff.val[2]), zero);
|
|
if(vminvq_u8(runv) != 0 && alpha){
|
|
run += simd_lanes;
|
|
pixels += simd_lanes*Channels;
|
|
continue;
|
|
}
|
|
if constexpr(Alpha)
|
|
runv = vandq_u8(runv, diff.val[3]);
|
|
const auto r = vminvq_u8(vorrq_u8(vandq_u8(vmvnq_u8(runv), iota), runv));
|
|
run += r;
|
|
pixels += r*Channels;
|
|
if(run > 0){
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
*p++ = x;
|
|
run -= 62;
|
|
}
|
|
if(run > 1){
|
|
*p++ = chunk_tag::run | (run-1);
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
}
|
|
const auto two = vdupq_n_u8(2);
|
|
diff.val[0] = vaddq_u8(diff.val[0], two);
|
|
diff.val[1] = vaddq_u8(diff.val[1], two);
|
|
diff.val[2] = vaddq_u8(diff.val[2], two);
|
|
const auto four = vdupq_n_u8(4);
|
|
const auto diffv = vandq_u8(vorrq_u8(vorrq_u8(vdupq_n_u8(chunk_tag::diff), vshlq_n_u8(diff.val[0], 4)), vorrq_u8(vshlq_n_u8(diff.val[1], 2), diff.val[2])), vcltq_u8(vorrq_u8(vorrq_u8(diff.val[0], diff.val[1]), diff.val[2]), four));
|
|
const auto eight = vdupq_n_u8(8);
|
|
diff.val[0] = vaddq_u8(vsubq_u8(diff.val[0], diff.val[1]), eight);
|
|
diff.val[2] = vaddq_u8(vsubq_u8(diff.val[2], diff.val[1]), eight);
|
|
diff.val[1] = vaddq_u8(diff.val[1], vdupq_n_u8(30));
|
|
const auto lu = vandq_u8(vorrq_u8(vdupq_n_u8(chunk_tag::luma), diff.val[1]), vceqq_u8(vorrq_u8(vandq_u8(vorrq_u8(diff.val[0], diff.val[2]), vdupq_n_u8(0xf0)), vandq_u8(diff.val[1], vdupq_n_u8(0xc0))), zero));
|
|
const auto ma = vorrq_u8(vshlq_n_u8(diff.val[0], 4), diff.val[2]);
|
|
uint8x16_t hash;
|
|
if constexpr(Alpha)
|
|
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vmulq_u8(pxs.val[3], vdupq_n_u8(11)))), vdupq_n_u8(63));
|
|
else
|
|
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vdupq_n_u8(static_cast<std::uint8_t>(255*11)))), vdupq_n_u8(63));
|
|
std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
|
|
[[maybe_unused]] std::uint8_t alphas[simd_lanes];
|
|
vst1q_u8(runs, runv);
|
|
vst1q_u8(diffs, diffv);
|
|
vst2q_u8(lumas, (uint8x16x2_t{lu, ma}));
|
|
vst1q_u8(hashs, hash);
|
|
if constexpr(Alpha)
|
|
if(!alpha)
|
|
vst1q_u8(alphas, diff.val[3]);
|
|
for(std::size_t i = r; i < simd_lanes; ++i){
|
|
if(runs[i]){
|
|
++run;
|
|
pixels += Channels;
|
|
continue;
|
|
}
|
|
if(run > 1){
|
|
*p++ = chunk_tag::run | (run-1);
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
const auto index_pos = hashs[i];
|
|
prev_hash = index_pos;
|
|
efficient_memcpy<Channels>(&px, pixels);
|
|
pixels += Channels;
|
|
if(index[index_pos] == px){
|
|
*p++ = chunk_tag::index | index_pos;
|
|
continue;
|
|
}
|
|
index[index_pos] = px;
|
|
|
|
if constexpr(Alpha)
|
|
if(!alpha && !alphas[i]){
|
|
*p++ = chunk_tag::rgba;
|
|
std::memcpy(p, &px, 4);
|
|
p += 4;
|
|
continue;
|
|
}
|
|
if(diffs[i])
|
|
*p++ = diffs[i];
|
|
else if(lumas[i*2]){
|
|
std::memcpy(p, lumas + i*2, 2);
|
|
p += 2;
|
|
}
|
|
else{
|
|
*p++ = chunk_tag::rgb;
|
|
efficient_memcpy<3>(p, &px);
|
|
p += 3;
|
|
}
|
|
}
|
|
prev = pxs;
|
|
}
|
|
p_.advance(p-p_.raw_pointer());
|
|
|
|
if constexpr(Alpha)
|
|
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
|
|
else{
|
|
rgb_t px_prev;
|
|
efficient_memcpy<3>(&px_prev, &px);
|
|
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
|
|
}
|
|
|
|
push<sizeof(padding)>(p_, padding);
|
|
}
|
|
#elif defined(__AVX2__)
|
|
/// Lookup table that maps the top five bits of `(isolated_bit * 0x077cb531)`
/// to the position of that bit.
static constexpr unsigned de_bruijn_bit_position_sequence[32] = {
  0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
/// Returns the zero-based position of the lowest set bit of `x`.
/// For `x == 0` the result is 0.
static constexpr unsigned lsb32(std::uint32_t x)noexcept{
  // Isolate the lowest set bit with unsigned two's-complement negation.
  // Negating through a signed type would overflow for x == 0x80000000.
  const std::uint32_t lowest = x & (~x + std::uint32_t{1});
  return de_bruijn_bit_position_sequence[static_cast<std::uint32_t>(lowest*0x077cb531u) >> 27];
}
|
|
template<std::uint8_t M>
|
|
static inline __m256i slli_epi8(__m256i v)noexcept{
|
|
const auto mask = _mm256_set1_epi8(static_cast<std::uint8_t>(0xff << M) >> M);
|
|
return _mm256_slli_epi16(_mm256_and_si256(v, mask), M);
|
|
}
|
|
template<std::uint8_t M>
|
|
static inline __m256i mul_epi8(__m256i v)noexcept{
|
|
if constexpr(M == 0)
|
|
return _mm256_setzero_si256();
|
|
else if constexpr(M == 1)
|
|
return v;
|
|
else if constexpr(M == 2)
|
|
return slli_epi8<1>(v);
|
|
else if constexpr(M == 3)
|
|
return _mm256_add_epi8(slli_epi8<1>(v), v);
|
|
else if constexpr(M == 4)
|
|
return slli_epi8<2>(v);
|
|
else if constexpr(M == 5)
|
|
return _mm256_add_epi8(slli_epi8<2>(v), v);
|
|
else if constexpr(M == 6)
|
|
return _mm256_add_epi8(slli_epi8<2>(v), slli_epi8<1>(v));
|
|
else if constexpr(M == 7)
|
|
return _mm256_sub_epi8(slli_epi8<3>(v), v);
|
|
else if constexpr(M == 8)
|
|
return slli_epi8<3>(v);
|
|
else if constexpr(M == 9)
|
|
return _mm256_add_epi8(slli_epi8<3>(v), v);
|
|
else if constexpr(M == 10)
|
|
return _mm256_add_epi8(slli_epi8<3>(v), slli_epi8<1>(v));
|
|
else if constexpr(M == 11)
|
|
return _mm256_add_epi8(_mm256_add_epi8(slli_epi8<3>(v), slli_epi8<1>(v)), v);
|
|
else if constexpr(M == 12)
|
|
return _mm256_add_epi8(slli_epi8<3>(v), slli_epi8<2>(v));
|
|
else if constexpr(M == 13)
|
|
return _mm256_add_epi8(_mm256_add_epi8(slli_epi8<3>(v), slli_epi8<2>(v)), v);
|
|
else if constexpr(M == 14)
|
|
return _mm256_sub_epi8(slli_epi8<4>(v), slli_epi8<1>(v));
|
|
else if constexpr(M == 15)
|
|
return _mm256_sub_epi8(slli_epi8<4>(v), v);
|
|
else
|
|
static_assert(M <= 15);
|
|
}
|
|
/// Builds the vector of predecessors for `pxs`: byte 0 is the last byte of
/// `prev`, and byte k (k >= 1) is byte k-1 of `pxs`.
static inline __m256i prev_vector(__m256i pxs, __m256i prev)noexcept{
  // Upper half: low 128 bits of `pxs`; lower half zeroed for now.
  const auto shifted_halves = _mm256_permute2x128_si256(pxs, pxs, 0x08);
  // Lower half: high 128 bits of `prev`, which hold its last byte.
  const auto prev_high = _mm256_extracti128_si256(prev, 1);
  const auto carry_in = _mm256_inserti128_si256(shifted_halves, prev_high, 0);
  // Per 128-bit half, shift in the last byte of the half that precedes it.
  return _mm256_alignr_epi8(pxs, carry_in, 15);
}
|
|
template<bool Alpha>
|
|
struct pixels_type{
|
|
__m256i val[3+Alpha];
|
|
};
|
|
static constexpr std::size_t simd_lanes = 256/8;
|
|
template<bool Alpha>
|
|
static inline pixels_type<Alpha> load(const std::uint8_t* ptr)noexcept{
|
|
if constexpr(Alpha){
|
|
const auto t1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
|
const auto t2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes));
|
|
const auto t3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes*2));
|
|
const auto t4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr+simd_lanes*3));
|
|
const auto lo12 = _mm256_unpacklo_epi8(t1, t2);
|
|
const auto lo34 = _mm256_unpacklo_epi8(t3, t4);
|
|
const auto lolo12lo34 = _mm256_unpacklo_epi16(lo12, lo34);
|
|
const auto hilo12lo34 = _mm256_unpackhi_epi16(lo12, lo34);
|
|
const auto lololo12lo34hilo12lo34 = _mm256_unpacklo_epi32(lolo12lo34, hilo12lo34);
|
|
const auto hilolo12lo34hilo12lo34 = _mm256_unpackhi_epi32(lolo12lo34, hilo12lo34);
|
|
const auto hi12 = _mm256_unpackhi_epi8(t1, t2);
|
|
const auto hi34 = _mm256_unpackhi_epi8(t3, t4);
|
|
const auto lohi12hi34 = _mm256_unpacklo_epi16(hi12, hi34);
|
|
const auto hihi12hi34 = _mm256_unpackhi_epi16(hi12, hi34);
|
|
const auto lolohi12hi34hihi12hi34 = _mm256_unpacklo_epi32(lohi12hi34, hihi12hi34);
|
|
const auto lolololo12lo34hilo12lo34lolohi12hi34hihi12hi34 = _mm256_unpacklo_epi64(lololo12lo34hilo12lo34, lolohi12hi34hihi12hi34);
|
|
const auto hilololo12lo34hilo12lo34lolohi12hi34hihi12hi34 = _mm256_unpackhi_epi64(lololo12lo34hilo12lo34, lolohi12hi34hihi12hi34);
|
|
const auto hilohi12hi34hihi12hi34 = _mm256_unpackhi_epi32(lohi12hi34, hihi12hi34);
|
|
const auto lohilolo12lo34hilo12lo34hilohi12hi34hihi12hi34 = _mm256_unpacklo_epi64(hilolo12lo34hilo12lo34, hilohi12hi34hihi12hi34);
|
|
const auto hihilolo12lo34hilo12lo34hilohi12hi34hihi12hi34 = _mm256_unpackhi_epi64(hilolo12lo34hilo12lo34, hilohi12hi34hihi12hi34);
|
|
const auto mask1 = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
|
|
const auto mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
|
|
const auto r = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(lolololo12lo34hilo12lo34lolohi12hi34hihi12hi34, mask1), mask2);
|
|
const auto g = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(hilololo12lo34hilo12lo34lolohi12hi34hihi12hi34, mask1), mask2);
|
|
const auto b = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(lohilolo12lo34hilo12lo34hilohi12hi34hihi12hi34, mask1), mask2);
|
|
const auto a = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(hihilolo12lo34hilo12lo34hilohi12hi34hihi12hi34, mask1), mask2);
|
|
return {{r, g, b, a}};
|
|
}
|
|
else{
|
|
const auto t1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
|
|
const auto t2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes/2));
|
|
const auto t3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes));
|
|
const auto t4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*3/2));
|
|
const auto t5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*2));
|
|
const auto t6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr+simd_lanes*5/2));
|
|
const auto mask01 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
|
|
const auto mask02 = _mm_setr_epi8(2, 5, 8, 11, 14, 0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13);
|
|
const auto mask03 = _mm_setr_epi8(1, 4, 7, 10, 13, 2, 5, 8, 11, 14, 0, 3, 6, 9, 12, 15);
|
|
static constexpr char _128 = static_cast<char>(0b1000'0000);
|
|
const auto mask11 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128);
|
|
const auto mask21 = _mm_setr_epi8(_128, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128, 0, 0, 0, 0, 0);
|
|
const auto mask12 = _mm_setr_epi8(_128, _128, _128, _128, _128, 0, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128);
|
|
const auto mask22 = _mm_setr_epi8(0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128, _128);
|
|
const auto mask13 = _mm_setr_epi8(_128, _128, _128, _128, _128, _128, _128, _128, _128, _128, 0, 0, 0, 0, 0, 0);
|
|
const auto mask23 = _mm_setr_epi8(_128, _128, _128, _128, _128, 0, 0, 0, 0, 0, _128, _128, _128, _128, _128, _128);
|
|
const auto x1 = _mm_shuffle_epi8(t1, mask01);
|
|
const auto x2 = _mm_shuffle_epi8(t2, mask02);
|
|
const auto x3 = _mm_shuffle_epi8(t3, mask03);
|
|
const auto x4 = _mm_shuffle_epi8(t4, mask01);
|
|
const auto x5 = _mm_shuffle_epi8(t5, mask02);
|
|
const auto x6 = _mm_shuffle_epi8(t6, mask03);
|
|
const auto r1 = _mm_blendv_epi8(_mm_alignr_epi8(x3, x3, 5), _mm_blendv_epi8(x1, _mm_alignr_epi8(x2, x2, 10), mask11), mask21);
|
|
const auto g1 = _mm_blendv_epi8(_mm_alignr_epi8(x1, x1, 6), _mm_blendv_epi8(x2, _mm_alignr_epi8(x3, x3, 10), mask12), mask22);
|
|
const auto b1 = _mm_blendv_epi8(_mm_alignr_epi8(x2, x2, 6), _mm_blendv_epi8(x3, _mm_alignr_epi8(x1, x1, 11), mask13), mask23);
|
|
const auto r2 = _mm_blendv_epi8(_mm_alignr_epi8(x6, x6, 5), _mm_blendv_epi8(x4, _mm_alignr_epi8(x5, x5, 10), mask11), mask21);
|
|
const auto g2 = _mm_blendv_epi8(_mm_alignr_epi8(x4, x4, 6), _mm_blendv_epi8(x5, _mm_alignr_epi8(x6, x6, 10), mask12), mask22);
|
|
const auto b2 = _mm_blendv_epi8(_mm_alignr_epi8(x5, x5, 6), _mm_blendv_epi8(x6, _mm_alignr_epi8(x4, x4, 11), mask13), mask23);
|
|
const auto r = _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
|
|
const auto g = _mm256_inserti128_si256(_mm256_castsi128_si256(g1), g2, 1);
|
|
const auto b = _mm256_inserti128_si256(_mm256_castsi128_si256(b1), b2, 1);
|
|
return {{r, g, b}};
|
|
}
|
|
}
|
|
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
|
|
static inline void encode_avx2(Pusher& p_, Puller& pixels_, const desc& desc){
|
|
static constexpr bool Alpha = Channels == 4;
|
|
std::uint8_t* p = p_.raw_pointer();
|
|
const std::uint8_t* pixels = pixels_.raw_pointer();
|
|
|
|
rgba_t index[index_size] = {};
|
|
|
|
const auto zero = _mm256_setzero_si256();
|
|
|
|
pixels_type<Alpha> prev;
|
|
prev.val[0] = prev.val[1] = prev.val[2] = zero;
|
|
if constexpr(Alpha)
|
|
prev.val[3] = _mm256_set1_epi8(static_cast<char>(0xff));
|
|
|
|
std::size_t run = 0;
|
|
rgba_t px = {0, 0, 0, 255};
|
|
auto prev_hash = static_cast<std::uint8_t>(index_size);
|
|
|
|
std::size_t px_len = desc.width * desc.height;
|
|
std::size_t simd_len = px_len / simd_lanes;
|
|
const std::size_t simd_len_32 = simd_len * simd_lanes;
|
|
px_len -= simd_len_32;
|
|
pixels_.advance(simd_len_32*Channels);
|
|
while(simd_len--){
|
|
const auto pxs = load<Alpha>(pixels);
|
|
pixels_type<Alpha> diff;
|
|
diff.val[0] = _mm256_sub_epi8(pxs.val[0], prev_vector(pxs.val[0], prev.val[0]));
|
|
diff.val[1] = _mm256_sub_epi8(pxs.val[1], prev_vector(pxs.val[1], prev.val[1]));
|
|
diff.val[2] = _mm256_sub_epi8(pxs.val[2], prev_vector(pxs.val[2], prev.val[2]));
|
|
bool alpha = true;
|
|
if constexpr(Alpha){
|
|
diff.val[3] = _mm256_sub_epi8(pxs.val[3], prev_vector(pxs.val[3], prev.val[3]));
|
|
alpha = _mm256_testz_si256(diff.val[3], diff.val[3]);
|
|
diff.val[3] = _mm256_cmpeq_epi8(diff.val[3], zero);
|
|
}
|
|
const auto ored = _mm256_or_si256(_mm256_or_si256(diff.val[0], diff.val[1]), diff.val[2]);
|
|
auto runv = _mm256_cmpeq_epi8(ored, zero);
|
|
if(_mm256_testz_si256(ored, ored) && alpha){
|
|
run += simd_lanes;
|
|
pixels += simd_lanes*Channels;
|
|
continue;
|
|
}
|
|
if constexpr(Alpha)
|
|
runv = _mm256_and_si256(runv, diff.val[3]);
|
|
const auto r = lsb32(~_mm256_movemask_epi8(runv));
|
|
run += r;
|
|
pixels += r*Channels;
|
|
if(run > 0){
|
|
while(run >= 62)[[unlikely]]{
|
|
static constexpr std::uint8_t x = chunk_tag::run | 61;
|
|
*p++ = x;
|
|
run -= 62;
|
|
}
|
|
if(run > 1){
|
|
*p++ = static_cast<std::uint8_t>(chunk_tag::run | (run-1));
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
}
|
|
const auto two = _mm256_set1_epi8(2);
|
|
diff.val[0] = _mm256_add_epi8(diff.val[0], two);
|
|
diff.val[1] = _mm256_add_epi8(diff.val[1], two);
|
|
diff.val[2] = _mm256_add_epi8(diff.val[2], two);
|
|
const auto diffor = _mm256_or_si256(_mm256_or_si256(diff.val[0], diff.val[1]), diff.val[2]);
|
|
const auto diffv = _mm256_and_si256(_mm256_or_si256(_mm256_or_si256(_mm256_set1_epi8(chunk_tag::diff), slli_epi8<4>(diff.val[0])), _mm256_or_si256(slli_epi8<2>(diff.val[1]), diff.val[2])), _mm256_cmpeq_epi8(_mm256_and_si256(diffor, _mm256_set1_epi8(0b11)), diffor));
|
|
const auto eight = _mm256_set1_epi8(8);
|
|
diff.val[0] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[0], diff.val[1]), eight);
|
|
diff.val[2] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[2], diff.val[1]), eight);
|
|
diff.val[1] = _mm256_add_epi8(diff.val[1], _mm256_set1_epi8(30));
|
|
const auto luma_mask = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
|
|
const auto lu = _mm256_permutevar8x32_epi32(_mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero)), luma_mask);
|
|
const auto ma = _mm256_permutevar8x32_epi32(_mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]), luma_mask);
|
|
__m256i hash;
|
|
if constexpr(Alpha)
|
|
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), mul_epi8<11>(pxs.val[3]))), _mm256_set1_epi8(63));
|
|
else
|
|
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), _mm256_set1_epi8(static_cast<std::uint8_t>(255*11)))), _mm256_set1_epi8(63));
|
|
alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
|
|
[[maybe_unused]] alignas(alignof(__m256i)) std::uint8_t alphas[simd_lanes];
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(runs), runv);
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(diffs), diffv);
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas), _mm256_unpacklo_epi8(lu, ma));
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas)+1, _mm256_unpackhi_epi8(lu, ma));
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(hashs), hash);
|
|
if constexpr(Alpha)
|
|
if(!alpha)
|
|
_mm256_store_si256(reinterpret_cast<__m256i*>(alphas), diff.val[3]);
|
|
for(std::size_t i = r; i < simd_lanes; ++i){
|
|
if(runs[i]){
|
|
++run;
|
|
pixels += Channels;
|
|
continue;
|
|
}
|
|
if(run > 1){
|
|
*p++ = static_cast<std::uint8_t>(chunk_tag::run | (run-1));
|
|
run = 0;
|
|
}
|
|
else if(run == 1){
|
|
if(prev_hash == index_size)[[unlikely]]
|
|
*p++ = chunk_tag::run;
|
|
else
|
|
*p++ = chunk_tag::index | prev_hash;
|
|
run = 0;
|
|
}
|
|
const auto index_pos = hashs[i];
|
|
prev_hash = index_pos;
|
|
efficient_memcpy<Channels>(&px, pixels);
|
|
pixels += Channels;
|
|
if(index[index_pos] == px){
|
|
*p++ = chunk_tag::index | index_pos;
|
|
continue;
|
|
}
|
|
index[index_pos] = px;
|
|
|
|
if constexpr(Alpha)
|
|
if(!alpha && !alphas[i]){
|
|
*p++ = chunk_tag::rgba;
|
|
std::memcpy(p, &px, 4);
|
|
p += 4;
|
|
continue;
|
|
}
|
|
if(diffs[i])
|
|
*p++ = diffs[i];
|
|
else if(lumas[i*2]){
|
|
std::memcpy(p, lumas + i*2, 2);
|
|
p += 2;
|
|
}
|
|
else{
|
|
*p++ = chunk_tag::rgb;
|
|
efficient_memcpy<3>(p, &px);
|
|
p += 3;
|
|
}
|
|
}
|
|
prev = pxs;
|
|
}
|
|
p_.advance(p-p_.raw_pointer());
|
|
|
|
if constexpr(Alpha)
|
|
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
|
|
else{
|
|
rgb_t px_prev;
|
|
efficient_memcpy<3>(&px_prev, &px);
|
|
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
|
|
}
|
|
|
|
push<sizeof(padding)>(p_, padding);
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
|
|
static inline void encode_impl(Pusher& p, Puller& pixels, const desc& desc){
|
|
rgba_t index[index_size] = {};
|
|
|
|
std::size_t px_len = desc.width * desc.height;
|
|
encode_body<Channels>(p, pixels, index, px_len);
|
|
|
|
push<sizeof(padding)>(p, padding);
|
|
}
|
|
|
|
template<typename Puller>
|
|
static inline desc decode_header(Puller& p){
|
|
desc d;
|
|
const auto magic_ = read_32(p);
|
|
d.width = read_32(p);
|
|
d.height = read_32(p);
|
|
d.channels = p.pull();
|
|
d.colorspace = static_cast<qoi::colorspace>(p.pull());
|
|
if(
|
|
d.width == 0 || d.height == 0 || magic_ != magic ||
|
|
d.channels < 3 || d.channels > 4 ||
|
|
d.height >= pixels_max / d.width
|
|
)[[unlikely]]
|
|
throw std::runtime_error("qoixx::qoi::decode: invalid header");
|
|
return d;
|
|
}
|
|
|
|
// Pick a default for QOIXX_DECODE_WITH_TABLES unless the user already set it:
// 0 on aarch64, 1 everywhere else. The marker macro records that the value was
// chosen here, so it can be undefined again after use.
#if !defined(QOIXX_DECODE_WITH_TABLES)
#  define QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
#  if defined(__aarch64__)
#    define QOIXX_DECODE_WITH_TABLES 0
#  else
#    define QOIXX_DECODE_WITH_TABLES 1
#  endif
#endif
|
|
|
|
#if QOIXX_DECODE_WITH_TABLES
|
|
static constexpr std::size_t hash_table_offset = std::numeric_limits<std::uint8_t>::max()+1 - chunk_tag::diff;
|
|
static constexpr std::array<int, std::numeric_limits<std::uint8_t>::max()+1+chunk_tag::run-chunk_tag::diff> create_hash_diff_table(){
|
|
std::array<int, std::numeric_limits<std::uint8_t>::max()+1+chunk_tag::run-chunk_tag::diff> table = {};
|
|
for(std::size_t i = 0; i <= std::numeric_limits<std::uint8_t>::max(); ++i){
|
|
constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
|
|
const auto vr = (i >> 4);
|
|
const auto vb = (i & mask_tail_4);
|
|
table[i] = (vr*3 + vb*7) % index_size;
|
|
}
|
|
for(std::size_t i = chunk_tag::diff; i < chunk_tag::luma; ++i){
|
|
constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
|
|
const auto vr = static_cast<int>((i >> 4) & mask_tail_2) - 2;
|
|
const auto vg = static_cast<int>((i >> 2) & mask_tail_2) - 2;
|
|
const auto vb = static_cast<int>( i & mask_tail_2) - 2;
|
|
table[i+hash_table_offset] = static_cast<std::uint8_t>((vr*3 + vg*5 + vb*7) % index_size);
|
|
}
|
|
for(std::size_t i = chunk_tag::luma; i < chunk_tag::run; ++i){
|
|
constexpr int vgv = chunk_tag::luma+40;
|
|
const int vg = i - vgv;
|
|
table[i+hash_table_offset] = static_cast<std::uint8_t>((vg*3 + (vg+8)*5 + vg*7) % index_size);
|
|
}
|
|
return table;
|
|
}
|
|
// Builds a 256-entry lookup table that splits a byte into its two nibbles:
// entry[b][0] is the high nibble (b >> 4) and entry[b][1] the low nibble
// (b & 0x0f). The values are stored as-is, without removing any bias.
static constexpr std::array<std::array<std::uint8_t, 2>, std::numeric_limits<std::uint8_t>::max()+1> create_luma_table(){
  std::array<std::array<std::uint8_t, 2>, std::numeric_limits<std::uint8_t>::max()+1> table = {};
  for(std::size_t i = 0; i <= std::numeric_limits<std::uint8_t>::max(); ++i){
    constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
    // std::-qualified: <cstdint> is only required to declare the names in
    // namespace std.
    table[i][0] = static_cast<std::uint8_t>(i >> 4);
    table[i][1] = static_cast<std::uint8_t>(i & mask_tail_4);
  }
  return table;
}
|
|
static constexpr std::array<std::array<std::int8_t, 3>, chunk_tag::luma> create_diff_table(){
|
|
std::array<std::array<std::int8_t, 3>, chunk_tag::luma> table = {};
|
|
for(std::size_t i = chunk_tag::diff; i < chunk_tag::luma; ++i){
|
|
constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
|
|
const auto vr = ((i >> 4) & mask_tail_2) - 2;
|
|
const auto vg = ((i >> 2) & mask_tail_2) - 2;
|
|
const auto vb = ( i & mask_tail_2) - 2;
|
|
table[i][0] = static_cast<uint8_t>(vr);
|
|
table[i][1] = static_cast<uint8_t>(vg);
|
|
table[i][2] = static_cast<uint8_t>(vb);
|
|
}
|
|
return table;
|
|
}
|
|
#endif
|
|
|
|
template<std::size_t Channels, typename Pusher, typename Puller>
|
|
static inline void decode_impl(Pusher& pixels, Puller& p, std::size_t px_len, std::size_t size){
|
|
#ifndef __aarch64__
|
|
using rgba_t = std::conditional_t<Channels == 4, qoi::rgba_t, qoi::rgb_t>;
|
|
#endif
|
|
rgba_t px = {};
|
|
if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
|
|
px.a = 255;
|
|
rgba_t index[index_size];
|
|
if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value){
|
|
index[(0*3+0*5+0*7+0*11)%index_size] = {};
|
|
index[(0*3+0*5+0*7+255*11)%index_size] = px;
|
|
}
|
|
else
|
|
index[(0*3+0*5+0*7+255*11)%index_size] = {};
|
|
|
|
#if QOIXX_DECODE_WITH_TABLES
|
|
#define QOIXX_HPP_WITH_TABLES(...) __VA_ARGS__
|
|
#define QOIXX_HPP_WITHOUT_TABLES(...)
|
|
#else
|
|
#define QOIXX_HPP_WITH_TABLES(...)
|
|
#define QOIXX_HPP_WITHOUT_TABLES(...) __VA_ARGS__
|
|
#endif
|
|
|
|
QOIXX_HPP_WITH_TABLES(
|
|
auto hash = px.hash() % index_size;
|
|
static constexpr auto luma_hash_diff_table = create_hash_diff_table();
|
|
static constexpr auto hash_diff_table = luma_hash_diff_table.data() + hash_table_offset;
|
|
)
|
|
|
|
const auto f = [&pixels, &p, &px_len, &size, &px, &index QOIXX_HPP_WITH_TABLES(, &hash)]{
|
|
const auto b1 = p.pull();
|
|
--size;
|
|
|
|
#if defined(__aarch64__) and not defined(QOIXX_NO_SIMD)
|
|
#define QOIXX_HPP_DECODE_RUN(px, run) { \
|
|
if constexpr(Pusher::is_contiguous){ \
|
|
++run; \
|
|
if(run >= 8){ \
|
|
std::conditional_t<Channels == 4, uint8x8x4_t, uint8x8x3_t> data = {vdup_n_u8(px.r), vdup_n_u8(px.g), vdup_n_u8(px.b)}; \
|
|
if constexpr(Channels == 4) \
|
|
data.val[3] = vdup_n_u8(px.a); \
|
|
while(run>=8){ \
|
|
if constexpr(Channels == 4) \
|
|
vst4_u8(pixels.raw_pointer(), data); \
|
|
else \
|
|
vst3_u8(pixels.raw_pointer(), data); \
|
|
pixels.advance(Channels*8); \
|
|
run -= 8; \
|
|
} \
|
|
} \
|
|
while(run--){push<Channels>(pixels, &px);} \
|
|
} \
|
|
else \
|
|
do{push<Channels>(pixels, &px);}while(run--); \
|
|
}
|
|
#else
|
|
#define QOIXX_HPP_DECODE_RUN(px, run) do{push<Channels>(pixels, &px);}while(run--);
|
|
#endif
|
|
|
|
if(b1 >= chunk_tag::run){
|
|
if(b1 < chunk_tag::rgb){
|
|
/*run*/
|
|
static constexpr std::uint32_t mask_tail_6 = 0b0011'1111u;
|
|
std::size_t run = b1 & mask_tail_6;
|
|
if(run >= px_len)[[unlikely]]
|
|
run = px_len;
|
|
px_len -= run;
|
|
QOIXX_HPP_DECODE_RUN(px, run)
|
|
return;
|
|
}
|
|
if(b1 == chunk_tag::rgb){
|
|
pull<3>(&px, p);
|
|
size -= 3;
|
|
QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
|
|
}
|
|
if constexpr(Channels == 4){
|
|
if(b1 == chunk_tag::rgba){
|
|
pull<4>(&px, p);
|
|
size -= 4;
|
|
QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
|
|
}
|
|
}
|
|
else{
|
|
if(b1 == chunk_tag::rgba)[[unlikely]]{
|
|
pull<3>(&px, p);
|
|
p.advance(1);
|
|
size -= 4;
|
|
QOIXX_HPP_WITH_TABLES(hash = px.hash() % index_size;)
|
|
}
|
|
}
|
|
}
|
|
else if(b1 < chunk_tag::diff){
|
|
/*index*/
|
|
if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
|
|
px = index[b1];
|
|
else
|
|
efficient_memcpy<Channels>(&px, index + b1);
|
|
push<Channels>(pixels, &px);
|
|
QOIXX_HPP_WITH_TABLES(hash = b1;)
|
|
return;
|
|
}
|
|
else if(b1 >= chunk_tag::luma){
|
|
/*luma*/
|
|
const auto b2 = p.pull();
|
|
--size;
|
|
QOIXX_HPP_WITH_TABLES(
|
|
static constexpr auto table = create_luma_table();
|
|
const auto drb = table[b2];
|
|
)
|
|
static constexpr int vgv = chunk_tag::luma+40;
|
|
const int vg = b1 - vgv;
|
|
QOIXX_HPP_WITH_TABLES(
|
|
px.r += vg + drb[0];
|
|
px.g += vg + 8;
|
|
px.b += vg + drb[1];
|
|
hash = (static_cast<int>(hash)+hash_diff_table[b1]+luma_hash_diff_table[b2]) % index_size;
|
|
) QOIXX_HPP_WITHOUT_TABLES(
|
|
static constexpr std::uint32_t mask_tail_4 = 0b0000'1111u;
|
|
px.r += vg + (b2 >> 4);
|
|
px.g += vg + 8;
|
|
px.b += vg + (b2 & mask_tail_4);
|
|
)
|
|
}
|
|
else{
|
|
/*diff*/
|
|
QOIXX_HPP_WITH_TABLES(
|
|
static constexpr auto table = create_diff_table();
|
|
const auto drgb = table[b1];
|
|
px.r += drgb[0];
|
|
px.g += drgb[1];
|
|
px.b += drgb[2];
|
|
hash = (static_cast<int>(hash)+hash_diff_table[b1]) % index_size;
|
|
) QOIXX_HPP_WITHOUT_TABLES(
|
|
static constexpr std::uint32_t mask_tail_2 = 0b0000'0011u;
|
|
px.r += ((b1 >> 4) & mask_tail_2) - 2;
|
|
px.g += ((b1 >> 2) & mask_tail_2) - 2;
|
|
px.b += ( b1 & mask_tail_2) - 2;
|
|
)
|
|
}
|
|
#undef QOIXX_HPP_DECODE_RUN
|
|
if constexpr(std::is_same<rgba_t, qoi::rgba_t>::value)
|
|
index[QOIXX_HPP_WITH_TABLES(hash) QOIXX_HPP_WITHOUT_TABLES(px.hash() % index_size)] = px;
|
|
else
|
|
efficient_memcpy<Channels>(index + QOIXX_HPP_WITH_TABLES(hash) QOIXX_HPP_WITHOUT_TABLES(px.hash() % index_size), &px);
|
|
#undef QOIXX_HPP_WITHOUT_TABLES
|
|
#undef QOIXX_HPP_WITH_TABLES
|
|
#ifdef QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
|
|
#undef QOIXX_DECODE_WITH_TABLES
|
|
#undef QOIXX_HPP_DECODE_WITH_TABLES_NOT_DEFINED
|
|
#endif
|
|
|
|
push<Channels>(pixels, &px);
|
|
};
|
|
|
|
while(px_len--)[[likely]]{
|
|
f();
|
|
if(size < sizeof(padding))[[unlikely]]{
|
|
throw std::runtime_error("qoixx::qoi::decode: insufficient input data");
|
|
}
|
|
}
|
|
}
|
|
public:
|
|
template<typename T, typename U>
|
|
static inline T encode(const U& u, const desc& desc){
|
|
using coU = container_operator<U>;
|
|
if(!coU::valid(u) || coU::size(u) < desc.width*desc.height*desc.channels || desc.width == 0 || desc.height == 0 || desc.channels < 3 || desc.channels > 4 || desc.height >= pixels_max / desc.width)[[unlikely]]
|
|
throw std::invalid_argument{"qoixx::qoi::encode: invalid argument"};
|
|
|
|
const auto max_size = static_cast<std::size_t>(desc.width) * desc.height * (desc.channels + 1) + header_size + sizeof(padding);
|
|
using coT = container_operator<T>;
|
|
T data = coT::construct(max_size);
|
|
auto p = coT::create_pusher(data);
|
|
auto puller = coU::create_puller(u);
|
|
|
|
write_32(p, magic);
|
|
write_32(p, desc.width);
|
|
write_32(p, desc.height);
|
|
p.push(desc.channels);
|
|
p.push(static_cast<std::uint8_t>(desc.colorspace));
|
|
|
|
#ifndef QOIXX_NO_SIMD
|
|
#if defined(__ARM_FEATURE_SVE)
|
|
if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
|
|
if(desc.channels == 4)
|
|
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH \
|
|
switch(svcntb()){ \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(128); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(256); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(384); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(512); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(640); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(768); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(896); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1024); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1152); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1280); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1408); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1536); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1664); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1792); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(1920); \
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(2048); \
|
|
default: while(true){/*unreachable*/} \
|
|
}
|
|
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(i) case i/8: encode_sve<i, 4>(p, puller, desc); break
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
|
|
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE
|
|
else
|
|
#define QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE(i) case i/8: encode_sve<i, 3>(p, puller, desc); break;
|
|
QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
|
|
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH_CASE
|
|
#undef QOIXX_HPP_SVE_REGISTER_SIZE_SWITCH
|
|
else
|
|
#elif defined(__aarch64__)
|
|
if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
|
|
if(desc.channels == 4)
|
|
encode_neon<4>(p, puller, desc);
|
|
else
|
|
encode_neon<3>(p, puller, desc);
|
|
else
|
|
#elif defined(__AVX2__)
|
|
if constexpr(coT::pusher::is_contiguous && coU::puller::is_contiguous)
|
|
if(desc.channels == 4)
|
|
encode_avx2<4>(p, puller, desc);
|
|
else
|
|
encode_avx2<3>(p, puller, desc);
|
|
else
|
|
#endif
|
|
#endif
|
|
if(desc.channels == 4)
|
|
encode_impl<4>(p, puller, desc);
|
|
else
|
|
encode_impl<3>(p, puller, desc);
|
|
|
|
return p.finalize();
|
|
}
|
|
template<typename T, typename U>
|
|
requires(sizeof(U) == 1)
|
|
static inline T encode(const U* pixels, std::size_t size, const desc& desc){
|
|
return encode<T>(std::make_pair(pixels, size), desc);
|
|
}
|
|
template<typename T, typename U>
|
|
requires (!std::is_pointer_v<U>)
|
|
static inline std::pair<T, desc> decode(const U& u, std::uint8_t channels = 0){
|
|
using coU = container_operator<U>;
|
|
const auto size = coU::size(u);
|
|
if(!coU::valid(u) || size < header_size + sizeof(padding) || (channels != 0 && channels != 3 && channels != 4))[[unlikely]]
|
|
throw std::invalid_argument{"qoixx::qoi::decode: invalid argument"};
|
|
auto puller = coU::create_puller(u);
|
|
|
|
const auto d = decode_header(puller);
|
|
if(channels == 0)
|
|
channels = d.channels;
|
|
|
|
const std::size_t px_len = static_cast<std::size_t>(d.width) * d.height;
|
|
using coT = container_operator<T>;
|
|
T data = coT::construct(px_len*channels);
|
|
auto p = coT::create_pusher(data);
|
|
|
|
if(channels == 4)
|
|
decode_impl<4>(p, puller, px_len, size);
|
|
else
|
|
decode_impl<3>(p, puller, px_len, size);
|
|
|
|
return std::make_pair(std::move(p.finalize()), d);
|
|
}
|
|
template<typename T, typename U>
|
|
requires(sizeof(U) == 1)
|
|
static inline std::pair<T, desc> decode(const U* pixels, std::size_t size, std::uint8_t channels = 0){
|
|
return decode<T>(std::make_pair(pixels, size), channels);
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
#endif //QOIXX_HPP_INCLUDED_
|