Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 28 additions & 38 deletions modules/core/include/opencv2/core/cvdef.h
Original file line number Diff line number Diff line change
Expand Up @@ -809,40 +809,20 @@ using std::uint64_t;
namespace cv
{

class float16_t
class hfloat
{
public:
#if CV_FP16_TYPE

float16_t() : h(0) {}
explicit float16_t(float x) { h = (__fp16)x; }
hfloat() : h(0) {}
explicit hfloat(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
static float16_t fromBits(ushort w)
{
Cv16suf u;
u.u = w;
float16_t result;
result.h = u.h;
return result;
}
static float16_t zero()
{
float16_t result;
result.h = (__fp16)0;
return result;
}
ushort bits() const
{
Cv16suf u;
u.h = h;
return u.u;
}
protected:
__fp16 h;

#else
float16_t() : w(0) {}
explicit float16_t(float x)
hfloat() : w(0) {}
explicit hfloat(float x)
{
#if CV_FP16 && CV_AVX2
__m128 v = _mm_load_ss(&x);
Expand Down Expand Up @@ -893,25 +873,35 @@ class float16_t
#endif
}

static float16_t fromBits(ushort b)
{
float16_t result;
result.w = b;
return result;
}
static float16_t zero()
{
float16_t result;
result.w = (ushort)0;
return result;
}
ushort bits() const { return w; }
protected:
ushort w;

#endif
};

// Reconstructs an hfloat (half-precision value) from its raw 16-bit pattern `w`.
// Counterpart of hfloat::bits(); replaces the old float16_t::fromBits().
inline hfloat hfloatFromBits(ushort w) {
#if CV_FP16_TYPE
// Native __fp16 available: reinterpret the bits via the union, then
// round-trip through float to construct the hfloat wrapper.
Cv16suf u;
u.u = w;
hfloat res(float(u.h));
return res;
#else
// Software path: expand the binary16 bit pattern into a binary32 pattern.
Cv32suf out;

// Shift exponent+mantissa into float32 position (<<13) and rebias the
// exponent by 127-15=112 (0x38000000 == 112 << 23).
unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
unsigned sign = (w & 0x8000) << 16;   // move sign from bit 15 to bit 31
unsigned e = w & 0x7c00;              // half-precision exponent field

out.u = t + (1 << 23);
// e >= 0x7c00: Inf/NaN -> add another 0x38000000 to saturate the float32 exponent.
// e == 0: zero/subnormal -> out.u above treated the value as normalized with a
//   +1 exponent offset; subtracting 2^-14 (6.103515625e-05f) in float arithmetic
//   yields the correct subnormal (or zero) magnitude.
// otherwise: normal number -> just the rebased pattern t.
// The sign bit is OR-ed back in at the end.
out.u = (e >= 0x7c00 ? t + 0x38000000 :
e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
hfloat res(out.f);
return res;
#endif
}

typedef hfloat float16_t;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be available for EXTERNAL users only.
We should not use that in OpenCV code anywhere else.

Because these hits still lead to conflicts with future C++ compilers:

$ grep -Rn float16_t ./
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:497:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:498:    const float16_t* a = (const float16_t*)_a;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:499:    const float16_t* b = (const float16_t*)_b;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:500:    float16_t* c = (float16_t*)_c;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:641:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:643:    const float16_t* a = (const float16_t*)_a;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:644:    const float16_t* b = (const float16_t*)_b;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:646:    const float16_t bias = (float16_t)_bias;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp:88:        esz = sizeof(float16_t);
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:438:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:439:    const float16_t* inwptr = (const float16_t*)_inwptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:440:    const float16_t* wptr = (const float16_t*)_wptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:441:    float16_t* outbuf = (float16_t*)_outbuf;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:594:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:595:    float16_t* outptr = (float16_t*)_outptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:760:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:761:    const float16_t* inptr = (const float16_t*)_inptr;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:65:    std::vector<float16_t> weightsBuf_FP16;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:66:    std::vector<float16_t> weightsWinoBuf_FP16;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:67:    float16_t* getWeightsFP16();
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:68:    float16_t* getWeightsWinoFP16();
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:29:static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:63:        dst[j] = float16_t(src[j]);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:77:float16_t* FastConv::getWeightsFP16()
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:82:float16_t* FastConv::getWeightsWinoFP16()
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:274:        float16_t* wptrWino_FP16 = nullptr;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:330:                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:338:                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:374:        float16_t* weightsPtr_FP16 = nullptr;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:403:                float16_t* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:414:                            packed_wptr[k] = (float16_t)(*wptr);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:416:                            packed_wptr[k] = (float16_t)0.f;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:476:    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:477:    if (esz == sizeof(float16_t))
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:574:    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:575:    if (esz == sizeof(float16_t))
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:582:            inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:583:            inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:639:                        _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:653:                        _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:715:                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:719:                                    inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:776:                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:783:                                        inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:849:                                float16_t* inpbufC = (float16_t* )inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:858:                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:900:                    float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1064:                                    inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1080:                                inpbuf_ki_FP16[0] = (float16_t)0.f;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1268:        esz = sizeof(float16_t);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1522:                            float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1558:                    const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1637:            AutoBuffer<float16_t, 16> aligned_val;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1640:            float16_t* bufPtr = aligned_val.data();
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1642:            float16_t *fp16Ptr = (float16_t *)field.data();
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1654:            AutoBuffer<float16_t, 16> aligned_val;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1655:            if (!isAligned<sizeof(float16_t)>(val))
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1658:                aligned_val.allocate(divUp(sz, sizeof(float16_t)));

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float16_t used in convolution related code should be valid, which is exactly float16_t from arm_neon.h. I am not sure about the float16_t in onnx_graph_simplifier.cpp though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may conflict with compilers too: typedef __fp16 float16_t;

Need to use #if !defined(__OPENCV_BUILD) && !defined(OPENCV_HIDE_FLOAT16_T) compilation guard.


}
#endif

Expand Down
4 changes: 2 additions & 2 deletions modules/core/include/opencv2/core/hal/hal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );

CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );

CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
Expand Down
2 changes: 1 addition & 1 deletion modules/core/include/opencv2/core/hal/intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}

//! @name Wide load with quad expansion
Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3137,7 +3137,7 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, u
// FP16
//

inline v_float32x8 v256_load_expand(const float16_t* ptr)
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
Expand All @@ -3149,7 +3149,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
__m128i ah = _mm256_cvtps_ph(a.val, 0);
Expand All @@ -3158,7 +3158,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
float CV_DECL_ALIGNED(32) buf[8];
v_store_aligned(buf, a);
for (int i = 0; i < 8; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
4 changes: 2 additions & 2 deletions modules/core/include/opencv2/core/hal/intrin_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,12 +506,12 @@ inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }

// FP16
inline v_float32x16 v512_load_expand(const float16_t* ptr)
inline v_float32x16 v512_load_expand(const hfloat* ptr)
{
return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
{
__m256i ah = _mm512_cvtps_ph(a.val, 0);
_mm256_storeu_si256((__m256i*)ptr, ah);
Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_cpp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3251,7 +3251,7 @@ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int,
////// FP16 support ///////

inline v_reg<float, simd128_width / sizeof(float)>
v_load_expand(const float16_t* ptr)
v_load_expand(const hfloat* ptr)
{
v_reg<float, simd128_width / sizeof(float)> v;
for( int i = 0; i < v.nlanes; i++ )
Expand All @@ -3262,7 +3262,7 @@ v_load_expand(const float16_t* ptr)
}
#if CV_SIMD256
inline v_reg<float, simd256_width / sizeof(float)>
v256_load_expand(const float16_t* ptr)
v256_load_expand(const hfloat* ptr)
{
v_reg<float, simd256_width / sizeof(float)> v;
for (int i = 0; i < v.nlanes; i++)
Expand All @@ -3274,7 +3274,7 @@ v256_load_expand(const float16_t* ptr)
#endif
#if CV_SIMD512
inline v_reg<float, simd512_width / sizeof(float)>
v512_load_expand(const float16_t* ptr)
v512_load_expand(const hfloat* ptr)
{
v_reg<float, simd512_width / sizeof(float)> v;
for (int i = 0; i < v.nlanes; i++)
Expand All @@ -3286,11 +3286,11 @@ v512_load_expand(const float16_t* ptr)
#endif

template<int n> inline void
v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
v_pack_store(hfloat* ptr, const v_reg<float, n>& v)
{
for( int i = 0; i < v.nlanes; i++ )
{
ptr[i] = float16_t(v.s[i]);
ptr[i] = hfloat(v.s[i]);
}
}

Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_lasx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2983,7 +2983,7 @@ OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,
// FP16
//

inline v_float32x8 v256_load_expand(const float16_t* ptr)
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
//1-load128, 2-permi, 3-cvt
Expand All @@ -2996,7 +2996,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
__m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
Expand All @@ -3005,7 +3005,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
float CV_DECL_ALIGNED(32) buf[8];
v_store_aligned(buf, a);
for (int i = 0; i < 8; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_lsx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2498,7 +2498,7 @@ OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, u
// FP16
//

inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#if CV_FP16
return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
Expand All @@ -2510,7 +2510,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& a)
inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
{
#if CV_FP16
__m128i res = (__m218i)__lsx_vfcvt_h_s(a.val, a.val);
Expand All @@ -2519,7 +2519,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& a)
float CV_DECL_ALIGNED(32) buf[4];
v_store_aligned(buf, a);
for (int i = 0; i < 4; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_msa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1838,7 +1838,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)

////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#ifndef msa_ld1_f16
v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
Expand All @@ -1848,7 +1848,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
return v_float32x4(msa_cvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
v4f16 hv = msa_cvt_f16_f32(v.val);

Expand All @@ -1859,20 +1859,20 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
float buf[4];
for( int i = 0; i < 4; i++ )
buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
float buf[4];
v_store(buf, v);
for( int i = 0; i < 4; i++ )
ptr[i] = (float16_t)buf[i];
ptr[i] = (hfloat)buf[i];
}
#endif

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2605,7 +2605,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo

////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
Expand All @@ -2616,7 +2616,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
return v_float32x4(vcvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
float16x4_t hv = vcvt_f16_f32(v.val);

Expand All @@ -2627,20 +2627,20 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_rvv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2873,30 +2873,30 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
////// FP16 support ///////

#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif

Expand Down
Loading