Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 28 additions & 38 deletions modules/core/include/opencv2/core/cvdef.h
Original file line number Diff line number Diff line change
Expand Up @@ -809,40 +809,20 @@ using std::uint64_t;
namespace cv
{

class float16_t
class hfloat
{
public:
#if CV_FP16_TYPE

float16_t() : h(0) {}
explicit float16_t(float x) { h = (__fp16)x; }
hfloat() : h(0) {}
explicit hfloat(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
static float16_t fromBits(ushort w)
{
Cv16suf u;
u.u = w;
float16_t result;
result.h = u.h;
return result;
}
static float16_t zero()
{
float16_t result;
result.h = (__fp16)0;
return result;
}
ushort bits() const
{
Cv16suf u;
u.h = h;
return u.u;
}
protected:
__fp16 h;

#else
float16_t() : w(0) {}
explicit float16_t(float x)
hfloat() : w(0) {}
explicit hfloat(float x)
{
#if CV_FP16 && CV_AVX2
__m128 v = _mm_load_ss(&x);
Expand Down Expand Up @@ -893,25 +873,35 @@ class float16_t
#endif
}

static float16_t fromBits(ushort b)
{
float16_t result;
result.w = b;
return result;
}
static float16_t zero()
{
float16_t result;
result.w = (ushort)0;
return result;
}
ushort bits() const { return w; }
protected:
ushort w;

#endif
};

// Reconstructs an hfloat (half-precision value) from its raw 16-bit pattern `w`.
// Counterpart of hfloat::bits(); replaces the old float16_t::fromBits().
inline hfloat hfloatFromBits(ushort w) {
#if CV_FP16_TYPE
// Native __fp16 available: reinterpret the bits via the union, then
// round-trip through float to construct the hfloat wrapper.
Cv16suf u;
u.u = w;
hfloat res(float(u.h));
return res;
#else
// Software path: expand the binary16 bit pattern into a binary32 pattern.
Cv32suf out;

// Shift exponent+mantissa into float32 position (<<13) and rebias the
// exponent by 127-15=112 (0x38000000 == 112 << 23).
unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
unsigned sign = (w & 0x8000) << 16;   // move sign from bit 15 to bit 31
unsigned e = w & 0x7c00;              // half-precision exponent field

out.u = t + (1 << 23);
// e >= 0x7c00: Inf/NaN -> add another 0x38000000 to saturate the float32 exponent.
// e == 0: zero/subnormal -> out.u above treated the value as normalized with a
//   +1 exponent offset; subtracting 2^-14 (6.103515625e-05f) in float arithmetic
//   yields the correct subnormal (or zero) magnitude.
// otherwise: normal number -> just the rebased pattern t.
// The sign bit is OR-ed back in at the end.
out.u = (e >= 0x7c00 ? t + 0x38000000 :
e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
hfloat res(out.f);
return res;
#endif
}

typedef hfloat float16_t;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be available for EXTERNAL users only.
We should not use that in OpenCV code anywhere else.

Because these hits still lead to conflicts with future C++ compilers:

$ grep -Rn float16_t ./
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:497:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:498:    const float16_t* a = (const float16_t*)_a;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:499:    const float16_t* b = (const float16_t*)_b;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:500:    float16_t* c = (float16_t*)_c;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:641:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:643:    const float16_t* a = (const float16_t*)_a;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:644:    const float16_t* b = (const float16_t*)_b;
./modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp:646:    const float16_t bias = (float16_t)_bias;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp:88:        esz = sizeof(float16_t);
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:438:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:439:    const float16_t* inwptr = (const float16_t*)_inwptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:440:    const float16_t* wptr = (const float16_t*)_wptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:441:    float16_t* outbuf = (float16_t*)_outbuf;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:594:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:595:    float16_t* outptr = (float16_t*)_outptr;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:760:    typedef __fp16 float16_t;
./modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp:761:    const float16_t* inptr = (const float16_t*)_inptr;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:65:    std::vector<float16_t> weightsBuf_FP16;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:66:    std::vector<float16_t> weightsWinoBuf_FP16;
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:67:    float16_t* getWeightsFP16();
./modules/dnn/src/layers/cpu_kernels/convolution.hpp:68:    float16_t* getWeightsWinoFP16();
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:29:static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:63:        dst[j] = float16_t(src[j]);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:77:float16_t* FastConv::getWeightsFP16()
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:82:float16_t* FastConv::getWeightsWinoFP16()
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:274:        float16_t* wptrWino_FP16 = nullptr;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:330:                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:338:                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:374:        float16_t* weightsPtr_FP16 = nullptr;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:403:                float16_t* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:414:                            packed_wptr[k] = (float16_t)(*wptr);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:416:                            packed_wptr[k] = (float16_t)0.f;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:476:    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:477:    if (esz == sizeof(float16_t))
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:574:    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:575:    if (esz == sizeof(float16_t))
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:582:            inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:583:            inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:639:                        _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:653:                        _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:715:                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:719:                                    inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:776:                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:783:                                        inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:849:                                float16_t* inpbufC = (float16_t* )inpbuf + s0;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:858:                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:900:                    float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1064:                                    inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1080:                                inpbuf_ki_FP16[0] = (float16_t)0.f;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1268:        esz = sizeof(float16_t);
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1522:                            float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
./modules/dnn/src/layers/cpu_kernels/convolution.cpp:1558:                    const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1637:            AutoBuffer<float16_t, 16> aligned_val;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1640:            float16_t* bufPtr = aligned_val.data();
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1642:            float16_t *fp16Ptr = (float16_t *)field.data();
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1654:            AutoBuffer<float16_t, 16> aligned_val;
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1655:            if (!isAligned<sizeof(float16_t)>(val))
./modules/dnn/src/onnx/onnx_graph_simplifier.cpp:1658:                aligned_val.allocate(divUp(sz, sizeof(float16_t)));

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float16_t used in convolution related code should be valid, which is exactly float16_t from arm_neon.h. I am not sure about the float16_t in onnx_graph_simplifier.cpp though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may conflict with compilers too: typedef __fp16 float16_t;

Need to use #if !defined(__OPENCV_BUILD) && !defined(OPENCV_HIDE_FLOAT16_T) compilation guard.


}
#endif

Expand Down
4 changes: 2 additions & 2 deletions modules/core/include/opencv2/core/hal/hal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );

CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );

CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
Expand Down
2 changes: 1 addition & 1 deletion modules/core/include/opencv2/core/hal/intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}

//! @name Wide load with quad expansion
Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3137,7 +3137,7 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, u
// FP16
//

inline v_float32x8 v256_load_expand(const float16_t* ptr)
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
Expand All @@ -3149,7 +3149,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
__m128i ah = _mm256_cvtps_ph(a.val, 0);
Expand All @@ -3158,7 +3158,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
float CV_DECL_ALIGNED(32) buf[8];
v_store_aligned(buf, a);
for (int i = 0; i < 8; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
4 changes: 2 additions & 2 deletions modules/core/include/opencv2/core/hal/intrin_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,12 +506,12 @@ inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }

// FP16
inline v_float32x16 v512_load_expand(const float16_t* ptr)
inline v_float32x16 v512_load_expand(const hfloat* ptr)
{
return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
{
__m256i ah = _mm512_cvtps_ph(a.val, 0);
_mm256_storeu_si256((__m256i*)ptr, ah);
Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_cpp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3251,7 +3251,7 @@ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int,
////// FP16 support ///////

inline v_reg<float, simd128_width / sizeof(float)>
v_load_expand(const float16_t* ptr)
v_load_expand(const hfloat* ptr)
{
v_reg<float, simd128_width / sizeof(float)> v;
for( int i = 0; i < v.nlanes; i++ )
Expand All @@ -3262,7 +3262,7 @@ v_load_expand(const float16_t* ptr)
}
#if CV_SIMD256
inline v_reg<float, simd256_width / sizeof(float)>
v256_load_expand(const float16_t* ptr)
v256_load_expand(const hfloat* ptr)
{
v_reg<float, simd256_width / sizeof(float)> v;
for (int i = 0; i < v.nlanes; i++)
Expand All @@ -3274,7 +3274,7 @@ v256_load_expand(const float16_t* ptr)
#endif
#if CV_SIMD512
inline v_reg<float, simd512_width / sizeof(float)>
v512_load_expand(const float16_t* ptr)
v512_load_expand(const hfloat* ptr)
{
v_reg<float, simd512_width / sizeof(float)> v;
for (int i = 0; i < v.nlanes; i++)
Expand All @@ -3286,11 +3286,11 @@ v512_load_expand(const float16_t* ptr)
#endif

template<int n> inline void
v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
v_pack_store(hfloat* ptr, const v_reg<float, n>& v)
{
for( int i = 0; i < v.nlanes; i++ )
{
ptr[i] = float16_t(v.s[i]);
ptr[i] = hfloat(v.s[i]);
}
}

Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_lasx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2983,7 +2983,7 @@ OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,
// FP16
//

inline v_float32x8 v256_load_expand(const float16_t* ptr)
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
//1-load128, 2-permi, 3-cvt
Expand All @@ -2996,7 +2996,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
__m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
Expand All @@ -3005,7 +3005,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
float CV_DECL_ALIGNED(32) buf[8];
v_store_aligned(buf, a);
for (int i = 0; i < 8; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
6 changes: 3 additions & 3 deletions modules/core/include/opencv2/core/hal/intrin_lsx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2498,7 +2498,7 @@ OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, u
// FP16
//

inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#if CV_FP16
return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
Expand All @@ -2510,7 +2510,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& a)
inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
{
#if CV_FP16
__m128i res = (__m218i)__lsx_vfcvt_h_s(a.val, a.val);
Expand All @@ -2519,7 +2519,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& a)
float CV_DECL_ALIGNED(32) buf[4];
v_store_aligned(buf, a);
for (int i = 0; i < 4; i++)
ptr[i] = float16_t(buf[i]);
ptr[i] = hfloat(buf[i]);
#endif
}

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_msa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1838,7 +1838,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)

////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#ifndef msa_ld1_f16
v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
Expand All @@ -1848,7 +1848,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
return v_float32x4(msa_cvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
v4f16 hv = msa_cvt_f16_f32(v.val);

Expand All @@ -1859,20 +1859,20 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
float buf[4];
for( int i = 0; i < 4; i++ )
buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
float buf[4];
v_store(buf, v);
for( int i = 0; i < 4; i++ )
ptr[i] = (float16_t)buf[i];
ptr[i] = (hfloat)buf[i];
}
#endif

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2605,7 +2605,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo

////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
Expand All @@ -2616,7 +2616,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
return v_float32x4(vcvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
float16x4_t hv = vcvt_f16_f32(v.val);

Expand All @@ -2627,20 +2627,20 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif

Expand Down
10 changes: 5 additions & 5 deletions modules/core/include/opencv2/core/hal/intrin_rvv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2873,30 +2873,30 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
////// FP16 support ///////

#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif

Expand Down
Loading