diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp
index 21dab519af93..c25345f40715 100644
--- a/modules/imgproc/src/filter.simd.hpp
+++ b/modules/imgproc/src/filter.simd.hpp
@@ -86,7 +86,6 @@ Ptr getLinearFilter(
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-typedef int CV_DECL_ALIGNED(1) unaligned_int;
 #define VEC_ALIGN CV_MALLOC_ALIGN
 
 int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs)
@@ -1083,21 +1082,6 @@ struct SymmColumnVec_32s8u
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                 i += VTraits::vlanes();
             }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - 4 /*VTraits::vlanes()*/ )
-#else
-            if( i <= width - VTraits::vlanes() )
-#endif
-            {
-                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
-                s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), s0);
-                for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
-                v_int32 s32 = v_round(s0);
-                v_int16 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-                i += 4 /*v_int32x4::nlanes*/ ;
-            }
         }
         else
         {
@@ -1139,20 +1123,6 @@ struct SymmColumnVec_32s8u
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                 i += VTraits::vlanes();
             }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - 4 /*VTraits::vlanes()*/ )
-#else
-            if( i <= width - VTraits::vlanes() )
-#endif
-            {
-                v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta));
-                for (k = 2; k <= ksize2; k++)
-                    s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
-                v_int32 s32 = v_round(s0);
-                v_int16 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-                i += 4 /*v_int32x4::nlanes*/ ;
-            }
         }
         return i;
     }
@@ -2236,20 +2206,6 @@ struct FilterVec_8u
             v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
             i += VTraits::vlanes();
         }
-#if CV_SIMD_WIDTH > 16
-        while( i <= width - 4 /*VTraits::vlanes()*/ )
-#else
-        if( i <= width - VTraits::vlanes() )
-#endif
-        {
-            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta));
-            for( k = 1; k < nz; k++ )
-                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
-            v_int32 s32 = v_round(s0);
-            v_int16 s16 = v_pack(s32, s32);
-            *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-            i += 4 /*VTraits::vlanes()*/ ;
-        }
         return i;
     }
 
diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp
index f5f433fec652..7303e06ad384 100644
--- a/modules/imgproc/src/fixedpoint.inl.hpp
+++ b/modules/imgproc/src/fixedpoint.inl.hpp
@@ -370,7 +370,7 @@ class ufixedpoint16
     static CV_ALWAYS_INLINE ufixedpoint16 one() { return ufixedpoint16((uint16_t)(1 << fixedShift)); }
     static CV_ALWAYS_INLINE ufixedpoint16 fromRaw(uint16_t v) { return ufixedpoint16(v); }
-    CV_ALWAYS_INLINE uint16_t raw() { return val; }
+    CV_ALWAYS_INLINE uint16_t raw() const { return val; }
 };
 
 }
diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp
index 33e58d4e80b4..7389cdbce947 100644
--- a/modules/imgproc/src/smooth.simd.hpp
+++ b/modules/imgproc/src/smooth.simd.hpp
@@ -1634,6 +1634,15 @@ void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len)
         dst[i] = val;
     }
 }
+
+inline uint32_t read_pair_as_u32(const ufixedpoint16 * mem)
+{
+    union Cv32sufX2 { uint32_t v32; int16_t v16[2]; } res;
+    res.v16[0] = mem->raw();
+    res.v16[1] = (mem + 1)->raw();
+    return res.v32;
+}
+
 template <>
 void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
@@ -1655,7 +1664,7 @@ void vlineSmooth(const ufixedpoint16* const * src, const
         v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
         v_int16 v_tmp0, v_tmp1;
 
-        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
+        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m)));
 
         const int16_t* src0 = (const int16_t*)src[0] + i;
         const int16_t* src1 = (const int16_t*)src[1] + i;
@@ -1683,7 +1692,7 @@ void vlineSmooth(const ufixedpoint16* const * src, const
         int j = 2;
         for (; j < n - 1; j+=2)
        {
-            v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j))));
+            v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m + j)));
 
             const int16_t* srcj0 = (const int16_t*)src[j] + i;
             const int16_t* srcj1 = (const int16_t*)src[j + 1] + i;
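
Not part of the patch: a minimal standalone sketch of the same safe-read pattern the new read_pair_as_u32 helper expresses with a union. Dereferencing a uint32_t* that actually points at a pair of 16-bit fixed-point values (the removed *((uint32_t*)m) and *(unaligned_int*)(dst + i) accesses) is undefined behaviour when the pointer is only 2-byte aligned, and it also violates strict aliasing; reading each half, or copying the bytes with memcpy as below, performs the same 32-bit load portably. The helper name load_u32_from_u16_pair is illustrative only and does not appear in OpenCV.

// Illustrative sketch, assuming plain C++17; not part of the patch.
#include <cstdint>
#include <cstring>

static inline uint32_t load_u32_from_u16_pair(const uint16_t* p)
{
    // *(const uint32_t*)p would require 4-byte alignment and a compatible type;
    // memcpy has neither restriction and compilers lower it to a single
    // (possibly unaligned) 32-bit load on x86 and AArch64.
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));  // reads p[0] and p[1] as one 32-bit value
    return v;
}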