diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp
index 31182a029a84..2e07b7f52669 100644
--- a/3rdparty/carotene/hal/tegra_hal.hpp
+++ b/3rdparty/carotene/hal/tegra_hal.hpp
@@ -1962,4 +1962,20 @@ inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_ste
 #define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel
 #endif // __ARM_ARCH=7
 
+#if 0 // OpenCV provides faster parallel implementation
+inline int TEGRA_ScharrDeriv(const uchar* src_data, size_t src_step,
+                      short* dst_data, size_t dst_step,
+                      int width, int height, int cn)
+{
+    // HAL adapter: reports NOT_IMPLEMENTED when Carotene declines the configuration, else forwards to it.
+    if (!CAROTENE_NS::isSupportedConfiguration()) return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    const CAROTENE_NS::Size2D image_size(width, height);
+    CAROTENE_NS::ScharrDeriv(image_size, cn, src_data, src_step, dst_data, dst_step);
+    return CV_HAL_ERROR_OK;
+}
+
+#undef cv_hal_ScharrDeriv
+#define cv_hal_ScharrDeriv TEGRA_ScharrDeriv
+#endif
+
 #endif
diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp
index e10a3258e90f..4765a15518a5 100644
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@@ -19,4 +19,8 @@
 #include "version/hal_rvv_071.hpp"
 #endif
 
-#endif
\ No newline at end of file
+#if defined(__riscv_v) && __riscv_v == 1000000
+#include "hal_rvv_1p0/merge.hpp" // core
+#endif
+
+#endif
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
new file mode 100644
index 000000000000..5278680eaaf5
--- /dev/null
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
@@ -0,0 +1,363 @@
+#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_merge8u // drop any earlier definition, then route each cv_hal_merge* hook to the function defined in this namespace
+#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
+#undef cv_hal_merge16u
+#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
+#undef cv_hal_merge32s
+#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
+#undef cv_hal_merge64s
+#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
+
+#if defined __GNUC__ // ask GCC not to auto-vectorize this hand-vectorized function
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static int merge8u(const uchar** src, uchar* dst, int len, int cn ) { // interleave cn planes: dst[i*cn + c] = src[c][i], i < len; always returns CV_HAL_ERROR_OK
+    int k = cn % 4 ? cn % 4 : 4; // leading planes (1..4) handled by the RVV branches below; cn - k is a multiple of 4
+    int i = 0, j;
+    int vl = __riscv_vsetvlmax_e8m1(); // maximum vector length for 8-bit elements at LMUL=1
+    if( k == 1 )
+    {
+        const uchar* src0 = src[0];
+        for( ; i <= len - vl; i += vl) // full vectors only; the remainder is handled by the scalar loop below
+        {
+            auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl); // byte stride is one whole pixel (cn channels), not k
+        }
+        #if defined(__clang__) // keep clang from vectorizing the scalar tail
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++)
+            dst[i*cn] = src0[i];
+    }
+    else if( k == 2 )
+    {
+        const uchar *src0 = src[0], *src1 = src[1];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
+            auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl); // stride must be cn so that cn = 6, 10, ... also work
+            __riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+        }
+    }
+    else if( k == 3 )
+    {
+        const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
+            auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
+            auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl); // stride must be cn so that cn = 7, 11, ... also work
+            __riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+            dst[i*cn+2] = src2[i];
+        }
+    }
+    else
+    {
+        const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
+            auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
+            auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
+            auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl); // stride must be cn so that cn = 8, 12, ... also work
+            __riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
+            __riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+            dst[i*cn+2] = src2[i];
+            dst[i*cn+3] = src3[i];
+        }
+    }
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( ; k < cn; k += 4 ) // remaining planes, four at a time, in plain scalar code
+    {
+        const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
+        for( i = 0, j = k; i < len; i++, j += cn ) // j is the dst index of channel k of pixel i
+        {
+            dst[j] = src0[i]; dst[j+1] = src1[i];
+            dst[j+2] = src2[i]; dst[j+3] = src3[i];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+#if defined __GNUC__ // ask GCC not to auto-vectorize this hand-vectorized function
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static int merge16u(const ushort** src, ushort* dst, int len, int cn ) { // interleave cn planes: dst[i*cn + c] = src[c][i], i < len; always returns CV_HAL_ERROR_OK
+    int k = cn % 4 ? cn % 4 : 4; // leading planes (1..4) handled by the RVV branches below; cn - k is a multiple of 4
+    int i = 0, j;
+    int vl = __riscv_vsetvlmax_e16m1(); // maximum vector length for 16-bit elements at LMUL=1
+    if( k == 1 )
+    {
+        const ushort* src0 = src[0];
+        for( ; i <= len - vl; i += vl) // full vectors only; the remainder is handled by the scalar loop below
+        {
+            auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl); // byte stride is one whole pixel (cn channels), not k
+        }
+        #if defined(__clang__) // keep clang from vectorizing the scalar tail
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++)
+            dst[i*cn] = src0[i];
+    }
+    else if( k == 2 )
+    {
+        const ushort *src0 = src[0], *src1 = src[1];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
+            auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl); // stride must be cn so that cn = 6, 10, ... also work
+            __riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+        }
+    }
+    else if( k == 3 )
+    {
+        const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
+            auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
+            auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl); // stride must be cn so that cn = 7, 11, ... also work
+            __riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+            dst[i*cn+2] = src2[i];
+        }
+    }
+    else
+    {
+        const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
+        for( ; i <= len - vl; i += vl)
+        {
+            auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
+            auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
+            auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
+            auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl); // stride must be cn so that cn = 8, 12, ... also work
+            __riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
+            __riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
+        }
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; i < len; i++ )
+        {
+            dst[i*cn] = src0[i];
+            dst[i*cn+1] = src1[i];
+            dst[i*cn+2] = src2[i];
+            dst[i*cn+3] = src3[i];
+        }
+    }
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( ; k < cn; k += 4 ) // remaining planes, four at a time, in plain scalar code
+    {
+        const ushort *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
+        for( i = 0, j = k; i < len; i++, j += cn ) // j is the dst index of channel k of pixel i
+        {
+            dst[j] = src0[i]; dst[j+1] = src1[i];
+            dst[j+2] = src2[i]; dst[j+3] = src3[i];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+#if defined __GNUC__
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static int merge32s(const int** src, int* dst, int len, int cn ) {
+    // Scalar interleave of cn planes: dst[pos*cn + c] = src[c][pos] for pos < len.
+    // The first `lead` (1..4) planes are written first, so the remaining count is a multiple of four.
+    const int lead = cn % 4 ? cn % 4 : 4;
+    switch( lead )
+    {
+    case 1: {
+        const int* plane0 = src[0];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+            dst[out] = plane0[pos];
+        break;
+    }
+    case 2: {
+        const int *plane0 = src[0], *plane1 = src[1];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos];
+            dst[out + 1] = plane1[pos];
+        }
+        break;
+    }
+    case 3: {
+        const int *plane0 = src[0], *plane1 = src[1], *plane2 = src[2];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos];
+            dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos];
+        }
+        break;
+    }
+    default: { // remaining case: four leading planes
+        const int *plane0 = src[0], *plane1 = src[1], *plane2 = src[2], *plane3 = src[3];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos]; dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos]; dst[out + 3] = plane3[pos];
+        }
+    }
+    }
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( int first = lead; first < cn; first += 4 )
+    {
+        const int *plane0 = src[first], *plane1 = src[first+1], *plane2 = src[first+2], *plane3 = src[first+3];
+        for( int pos = 0, out = first; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos]; dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos]; dst[out + 3] = plane3[pos];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+#if defined __GNUC__
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static int merge64s(const int64** src, int64* dst, int len, int cn ) {
+    // Scalar interleave of cn planes: dst[pos*cn + c] = src[c][pos] for pos < len.
+    // The first `lead` (1..4) planes are written first, so the remaining count is a multiple of four.
+    const int lead = cn % 4 ? cn % 4 : 4;
+    switch( lead )
+    {
+    case 1: {
+        const int64* plane0 = src[0];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+            dst[out] = plane0[pos];
+        break;
+    }
+    case 2: {
+        const int64 *plane0 = src[0], *plane1 = src[1];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos];
+            dst[out + 1] = plane1[pos];
+        }
+        break;
+    }
+    case 3: {
+        const int64 *plane0 = src[0], *plane1 = src[1], *plane2 = src[2];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos];
+            dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos];
+        }
+        break;
+    }
+    default: { // remaining case: four leading planes
+        const int64 *plane0 = src[0], *plane1 = src[1], *plane2 = src[2], *plane3 = src[3];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( int pos = 0, out = 0; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos]; dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos]; dst[out + 3] = plane3[pos];
+        }
+    }
+    }
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( int first = lead; first < cn; first += 4 )
+    {
+        const int64 *plane0 = src[first], *plane1 = src[first+1], *plane2 = src[first+2], *plane3 = src[first+3];
+        for( int pos = 0, out = first; pos < len; ++pos, out += cn )
+        {
+            dst[out] = plane0[pos]; dst[out + 1] = plane1[pos];
+            dst[out + 2] = plane2[pos]; dst[out + 3] = plane3[pos];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+}}
+
+#endif
diff --git a/3rdparty/kleidicv/CMakeLists.txt b/3rdparty/kleidicv/CMakeLists.txt
index 26e485441603..a7f7c1a37c94 100644
--- a/3rdparty/kleidicv/CMakeLists.txt
+++ b/3rdparty/kleidicv/CMakeLists.txt
@@ -1,8 +1,8 @@
 project(kleidicv_hal)
 
 set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
-ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
-ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
+ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
+ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
 
 if(KLEIDICV_SOURCE_PATH)
   set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bc9cbe03824..9e983bad79b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,6 +52,10 @@ if(POLICY CMP0056)
   cmake_policy(SET CMP0056 NEW)  # try_compile(): link flags
 endif()
 
+if(POLICY CMP0057)
+  cmake_policy(SET CMP0057 NEW)  # CMake 3.3: if(IN_LIST) support
+endif()
+
 if(POLICY CMP0066)
   cmake_policy(SET CMP0066 NEW)  # CMake 3.7: try_compile(): use per-config flags, like CMAKE_CXX_FLAGS_RELEASE
 endif()
@@ -217,7 +221,7 @@ OCV_OPTION(WITH_1394 "Include IEEE1394 support" OFF
 OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O (iOS/visionOS/Mac)" ON
   VISIBLE_IF APPLE
   VERIFY HAVE_AVFOUNDATION)
-OCV_OPTION(WITH_AVIF "Enable AVIF support" OFF
+OCV_OPTION(WITH_AVIF "Enable AVIF support" ON
   VERIFY HAVE_AVIF)
 OCV_OPTION(WITH_CAP_IOS "Enable iOS video capture" ON
   VISIBLE_IF IOS
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 5344b1597469..f94235038f43 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -353,23 +353,23 @@ function(ocv_target_include_directories target)
   #ocv_debug_message("ocv_target_include_directories(${target} ${ARGN})")
   _ocv_fix_target(target)
   set(__params "")
-  if(CV_GCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
-      ";${ARGN};" MATCHES "/usr/include;")
-    return() # workaround for GCC 6.x bug
-  endif()
-  set(__params "")
   set(__system_params "")
   set(__var_name __params)
   foreach(dir ${ARGN})
     if("${dir}" STREQUAL "SYSTEM")
       set(__var_name __system_params)
     else()
-      get_filename_component(__abs_dir "${dir}" ABSOLUTE)
-      ocv_is_opencv_directory(__is_opencv_dir "${dir}")
-      if(__is_opencv_dir)
-        list(APPEND ${__var_name} "${__abs_dir}")
+      if(CV_GCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
+          dir MATCHES "/usr/include$")
+         # workaround for GCC 6.x bug
       else()
-        list(APPEND ${__var_name} "${dir}")
+        get_filename_component(__abs_dir "${dir}" ABSOLUTE)
+        ocv_is_opencv_directory(__is_opencv_dir "${dir}")
+        if(__is_opencv_dir)
+          list(APPEND ${__var_name} "${__abs_dir}")
+        else()
+          list(APPEND ${__var_name} "${dir}")
+        endif()
       endif()
     endif()
   endforeach()
diff --git a/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown b/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown
index 30ed9185762c..03e152f3866b 100644
--- a/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown
+++ b/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown
@@ -4,9 +4,9 @@ Arithmetic Operations on Images {#tutorial_js_image_arithmetics}
 Goal
 ----
 
--   Learn several arithmetic operations on images like addition, subtraction, bitwise operations
+-   Learn several arithmetic operations on images like addition, subtraction, bitwise operations,
     etc.
--   You will learn these functions : **cv.add()**, **cv.subtract()**  etc.
+-   You will learn these functions : **cv.add()**, **cv.subtract()**, etc.
 
 Image Addition
 --------------
diff --git a/doc/tutorial-utils.js b/doc/tutorial-utils.js
index 3f08f1d2c1ff..9703e227795e 100644
--- a/doc/tutorial-utils.js
+++ b/doc/tutorial-utils.js
@@ -72,12 +72,23 @@ function buttonsToAdd($elements, $heading, $type) {
 }
 
 function addTutorialsButtons() {
-    $("h1").each(function() {
-        var $elements = $(this).nextUntil("h1")
+    // See https://github.com/opencv/opencv/issues/26339
+    var $lastHeader = undefined // most recent h1/h2/h3 that has not yet been given toggle buttons
+    $("h1,h2,h3,div.newInnerHTML").each(function() {
+        if( this.tagName.startsWith("H") ) {
+            $lastHeader = $(this)
+            return true // loop-continue
+        }
+        if( $lastHeader === undefined ) {
+            return true // loop-continue
+        }
+        var $toggleHeader = $lastHeader.prop("tagName").toLowerCase() // a jQuery object has no .tagName; read it from the element ("h1", "h2" or "h3")
+        var $elements = $lastHeader.nextUntil($toggleHeader)
         var $lower = $elements.find("div.newInnerHTML")
         $elements = $elements.add($lower)
         $elements = $elements.filter("div.newInnerHTML")
-        buttonsToAdd($elements, $(this), "h1")
+        buttonsToAdd($elements, $lastHeader, $toggleHeader)
+        $lastHeader = undefined // buttons are added once per header; later toggle blocks are skipped until the next header
     });
     $(".toggleable_button").first().click();
     var $clickDefault = $('.toggleable_button.label_python').first();
diff --git a/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown b/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown
index acd66bdeecf9..197292808f81 100644
--- a/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown
+++ b/doc/tutorials/core/file_input_output_with_xml_yml/file_input_output_with_xml_yml.markdown
@@ -1,4 +1,4 @@
-File Input and Output using XML and YAML files {#tutorial_file_input_output_with_xml_yml}
+File Input and Output using XML / YAML / JSON files {#tutorial_file_input_output_with_xml_yml}
 ==============================================
 
 @tableofcontents
@@ -14,12 +14,12 @@ File Input and Output using XML and YAML files {#tutorial_file_input_output_with
 Goal
 ----
 
-You'll find answers for the following questions:
+You'll find answers to the following questions:
 
--   How to print and read text entries to a file and OpenCV using YAML or XML files?
--   How to do the same for OpenCV data structures?
--   How to do this for your data structures?
--   Usage of OpenCV data structures such as @ref cv::FileStorage , @ref cv::FileNode or @ref
+-   How do you write text entries to a file and read them back in OpenCV using YAML, XML, or JSON files?
+-   How can you perform the same operations for OpenCV data structures?
+-   How can this be done for your custom data structures?
+-   How do you use OpenCV data structures, such as @ref cv::FileStorage , @ref cv::FileNode or @ref
     cv::FileNodeIterator .
 
 Source code
@@ -49,14 +49,14 @@ Here's a sample code of how to achieve all the stuff enumerated at the goal list
 Explanation
 -----------
 
-Here we talk only about XML and YAML file inputs. Your output (and its respective input) file may
+Here we talk only about XML, YAML and JSON file inputs. Your output (and its respective input) file may
 have only one of these extensions and the structure coming from this. They are two kinds of data
 structures you may serialize: *mappings* (like the STL map and the Python dictionary) and *element sequence* (like the STL
 vector). The difference between these is that in a map every element has a unique name through what
 you may access it. For sequences you need to go through them to query a specific item.
 
--#  **XML/YAML File Open and Close.** Before you write any content to such file you need to open it
-    and at the end to close it. The XML/YAML data structure in OpenCV is @ref cv::FileStorage . To
+-#  **XML/YAML/JSON File Open and Close.** Before you write any content to such file you need to open it
+    and at the end to close it. The XML/YAML/JSON data structure in OpenCV is @ref cv::FileStorage . To
     specify that this structure to which file binds on your hard drive you can use either its
     constructor or the *open()* function of this:
     @add_toggle_cpp
diff --git a/modules/3d/src/sqpnp.cpp b/modules/3d/src/sqpnp.cpp
index ef5d2470ad0e..b66998e68fa4 100644
--- a/modules/3d/src/sqpnp.cpp
+++ b/modules/3d/src/sqpnp.cpp
@@ -1,3 +1,10 @@
+// Implementation of SQPnP as described in the paper:
+//
+// "A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem" by G. Terzakis and M. Lourakis
+//     a) Paper:         https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460460.pdf
+//     b) Supplementary: https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460460-supp.pdf
+
+
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
@@ -39,6 +46,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "precomp.hpp"
 #include "sqpnp.hpp"
 
+#ifdef HAVE_EIGEN
+#include <Eigen/Dense>
+#endif
+
 namespace cv {
 namespace sqpnp {
 
@@ -52,8 +63,8 @@ const double PoseSolver::POINT_VARIANCE_THRESHOLD = 1e-5;
 const double PoseSolver::SQRT3 = std::sqrt(3);
 const int PoseSolver::SQP_MAX_ITERATION = 15;
 
-//No checking done here for overflow, since this is not public all call instances
-//are assumed to be valid
+// No checking is done here for overflow: since this helper is not public, all call
+// instances are assumed to be valid
 template <typename tp, int snrows, int sncols,
     int dnrows, int dncols>
     void set(int row, int col, cv::Matx<tp, dnrows, dncols>& dest,
@@ -78,7 +89,7 @@ PoseSolver::PoseSolver()
 void PoseSolver::solve(InputArray objectPoints, InputArray imagePoints, OutputArrayOfArrays rvecs,
     OutputArrayOfArrays tvecs)
 {
-    //Input checking
+    // Input checking
     int objType = objectPoints.getMat().type();
     CV_CheckType(objType, objType == CV_32FC3 || objType == CV_64FC3,
         "Type of objectPoints must be CV_32FC3 or CV_64FC3");
@@ -158,12 +169,12 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
         sum_img += img_pt;
         sum_obj += obj_pt;
 
-        const double& x = img_pt.x, & y = img_pt.y;
-        const double& X = obj_pt.x, & Y = obj_pt.y, & Z = obj_pt.z;
+        const double x = img_pt.x, y = img_pt.y;
+        const double X = obj_pt.x, Y = obj_pt.y, Z = obj_pt.z;
         double sq_norm = x * x + y * y;
         sq_norm_sum += sq_norm;
 
-        double X2 = X * X,
+        const double X2 = X * X,
             XY = X * Y,
             XZ = X * Z,
             Y2 = Y * Y,
@@ -178,47 +189,47 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
         omega_(2, 2) += Z2;
 
 
-        //Populating this manually saves operations by only calculating upper triangle
-        omega_(0, 6) += -x * X2; omega_(0, 7) += -x * XY; omega_(0, 8) += -x * XZ;
-        omega_(1, 7) += -x * Y2; omega_(1, 8) += -x * YZ;
-        omega_(2, 8) += -x * Z2;
+        // Populating this manually saves operations by only calculating upper triangle
+        omega_(0, 6) -= x * X2; omega_(0, 7) -= x * XY; omega_(0, 8) -= x * XZ;
+        omega_(1, 7) -= x * Y2; omega_(1, 8) -= x * YZ;
+        omega_(2, 8) -= x * Z2;
 
-        omega_(3, 6) += -y * X2; omega_(3, 7) += -y * XY; omega_(3, 8) += -y * XZ;
-        omega_(4, 7) += -y * Y2; omega_(4, 8) += -y * YZ;
-        omega_(5, 8) += -y * Z2;
+        omega_(3, 6) -= y * X2; omega_(3, 7) -= y * XY; omega_(3, 8) -= y * XZ;
+        omega_(4, 7) -= y * Y2; omega_(4, 8) -= y * YZ;
+        omega_(5, 8) -= y * Z2;
 
 
         omega_(6, 6) += sq_norm * X2; omega_(6, 7) += sq_norm * XY; omega_(6, 8) += sq_norm * XZ;
         omega_(7, 7) += sq_norm * Y2; omega_(7, 8) += sq_norm * YZ;
         omega_(8, 8) += sq_norm * Z2;
 
-        //Compute qa_sum. Certain pairs of elements are equal, so filling them outside the loop saves some operations
+        // Compute qa_sum. Certain pairs of elements are equal, so filling them outside the loop saves some operations
         qa_sum(0, 0) += X; qa_sum(0, 1) += Y; qa_sum(0, 2) += Z;
 
-        qa_sum(0, 6) += -x * X; qa_sum(0, 7) += -x * Y; qa_sum(0, 8) += -x * Z;
-        qa_sum(1, 6) += -y * X; qa_sum(1, 7) += -y * Y; qa_sum(1, 8) += -y * Z;
+        qa_sum(0, 6) -= x * X; qa_sum(0, 7) -= x * Y; qa_sum(0, 8) -= x * Z;
+        qa_sum(1, 6) -= y * X; qa_sum(1, 7) -= y * Y; qa_sum(1, 8) -= y * Z;
 
         qa_sum(2, 6) += sq_norm * X; qa_sum(2, 7) += sq_norm * Y; qa_sum(2, 8) += sq_norm * Z;
     }
 
-    //Complete qa_sum
+    // Complete qa_sum
     qa_sum(1, 3) = qa_sum(0, 0); qa_sum(1, 4) = qa_sum(0, 1); qa_sum(1, 5) = qa_sum(0, 2);
     qa_sum(2, 0) = qa_sum(0, 6); qa_sum(2, 1) = qa_sum(0, 7); qa_sum(2, 2) = qa_sum(0, 8);
     qa_sum(2, 3) = qa_sum(1, 6); qa_sum(2, 4) = qa_sum(1, 7); qa_sum(2, 5) = qa_sum(1, 8);
 
 
-    //lower triangles of omega_'s off-diagonal blocks (0:2, 6:8), (3:5, 6:8) and (6:8, 6:8)
+    // lower triangles of omega_'s off-diagonal blocks (0:2, 6:8), (3:5, 6:8) and (6:8, 6:8)
     omega_(1, 6) = omega_(0, 7); omega_(2, 6) = omega_(0, 8); omega_(2, 7) = omega_(1, 8);
     omega_(4, 6) = omega_(3, 7); omega_(5, 6) = omega_(3, 8); omega_(5, 7) = omega_(4, 8);
     omega_(7, 6) = omega_(6, 7); omega_(8, 6) = omega_(6, 8); omega_(8, 7) = omega_(7, 8);
 
-    //upper triangle of omega_'s block (3:5, 3:5)
+    // upper triangle of omega_'s block (3:5, 3:5)
     omega_(3, 3) = omega_(0, 0); omega_(3, 4) = omega_(0, 1); omega_(3, 5) = omega_(0, 2);
                                  omega_(4, 4) = omega_(1, 1); omega_(4, 5) = omega_(1, 2);
                                                               omega_(5, 5) = omega_(2, 2);
 
-    //Mirror omega_'s upper triangle to lower triangle
-    //Note that elements (7, 6), (8, 6) & (8, 7) have already been assigned above
+    // Mirror omega_'s upper triangle to lower triangle
+    // Note that elements (7, 6), (8, 6) & (8, 7) have already been assigned above
     omega_(1, 0) = omega_(0, 1);
     omega_(2, 0) = omega_(0, 2); omega_(2, 1) = omega_(1, 2);
     omega_(3, 0) = omega_(0, 3); omega_(3, 1) = omega_(1, 3); omega_(3, 2) = omega_(2, 3);
@@ -240,12 +251,26 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
     CV_Assert(point_coordinate_variance >= POINT_VARIANCE_THRESHOLD);
 
     Matx<double, 3, 3> q_inv;
-    analyticalInverse3x3Symm(q, q_inv);
+    if (!invertSPD3x3(q, q_inv)) analyticalInverse3x3Symm(q, q_inv);
 
     p_ = -q_inv * qa_sum;
 
     omega_ += qa_sum.t() * p_;
 
+#ifdef HAVE_EIGEN
+    // Rank revealing QR nullspace computation with full pivoting.
+    // This is slightly less accurate compared to SVD but x2-x3 faster
+    Eigen::Matrix<double, 9, 9> omega_eig, tmp_eig;
+    cv::cv2eigen(omega_, omega_eig);
+    Eigen::FullPivHouseholderQR<Eigen::Matrix<double, 9, 9> > rrqr(omega_eig);
+    tmp_eig = rrqr.matrixQ();
+    cv::eigen2cv(tmp_eig, u_);
+
+    tmp_eig = rrqr.matrixQR().template triangularView<Eigen::Upper>(); // R
+    Eigen::Matrix<double, 9, 1> S_eig = tmp_eig.diagonal().array().abs();
+    cv::eigen2cv(S_eig, s_);
+#else
+    // Use OpenCV's SVD
     cv::SVD omega_svd(omega_, cv::SVD::FULL_UV);
     s_ = omega_svd.w;
     u_ = cv::Mat(omega_svd.vt.t());
@@ -255,6 +280,8 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
     u_ = u_.t(); // eigenvectors were returned as rows
 #endif
 
+#endif // HAVE_EIGEN
+
     CV_Assert(s_(0) >= 1e-7);
 
     while (s_(7 - num_null_vectors_) < RANK_TOLERANCE) num_null_vectors_++;
@@ -276,7 +303,7 @@ void PoseSolver::solveInternal(InputArray objectPoints)
 
         SQPSolution solutions[2];
 
-        //If e is orthogonal, we can skip SQP
+        // If e is orthogonal, we can skip SQP
         if (orthogonality_sq_err < ORTHOGONALITY_SQUARED_ERROR_THRESHOLD)
         {
             solutions[0].r_hat = det3x3(e) * e;
@@ -393,6 +420,77 @@ void PoseSolver::solveSQPSystem(const cv::Matx<double, 9, 1>& r, cv::Matx<double
     delta += N * y;
 }
 
+// Inverse of SPD 3x3 A via a lower triangular sqrt-free Cholesky
+// factorization A=L*D*L' (L has ones on its diagonal, D is diagonal).
+//
+// Only the lower triangular part of A is accessed.
+// A1 receives the inverse; it is written only after all three pivot checks have passed.
+// The function returns true if successful, false as soon as a pivot of D is <= 1E-10.
+//
+// see http://euler.nmt.edu/~brian/ldlt.html
+//
+bool PoseSolver::invertSPD3x3(const cv::Matx<double, 3, 3>& A, cv::Matx<double, 3, 3>& A1)
+{
+    double L[3*3], D[3], v[2], x[3]; // L is indexed row-major; only L[3], L[6], L[7] (below the diagonal) are set; v and x are scratch
+
+    v[0]=D[0]=A(0, 0);
+    if(v[0]<=1E-10) return false; // first pivot is not sufficiently positive
+    v[1]=1.0/v[0];
+    L[3]=A(1, 0)*v[1]; // first column of L: A(i, 0)/D[0]
+    L[6]=A(2, 0)*v[1];
+    //L[0]=1.0;
+    //L[1]=L[2]=0.0;
+
+    v[0]=L[3]*D[0];
+    v[1]=D[1]=A(1, 1)-L[3]*v[0];
+    if(v[1]<=1E-10) return false; // second pivot is not sufficiently positive
+    L[7]=(A(2, 1)-L[6]*v[0])/v[1];
+    //L[4]=1.0;
+    //L[5]=0.0;
+
+    v[0]=L[6]*D[0];
+    v[1]=L[7]*D[1];
+    D[2]=A(2, 2)-L[6]*v[0]-L[7]*v[1];
+    if(D[2]<=1E-10) return false; // third pivot is not sufficiently positive
+    //L[8]=1.0;
+
+    D[0]=1.0/D[0]; // from here on D holds the reciprocals of the pivots
+    D[1]=1.0/D[1];
+    D[2]=1.0/D[2];
+
+    /* Forward solve Lx = e0 */
+    //x[0]=1.0;
+    x[1]=-L[3];
+    x[2]=-L[6]+L[7]*L[3];
+
+    /* Backward solve D*L'x = y (the result is stored into row 0 of A1) */
+    A1(0, 2)=x[2]=x[2]*D[2];
+    A1(0, 1)=x[1]=x[1]*D[1]-L[7]*x[2];
+    A1(0, 0)     =     D[0]-L[3]*x[1]-L[6]*x[2];
+
+    /* Forward solve Lx = e1 */
+    //x[0]=0.0;
+    //x[1]=1.0;
+    x[2]=-L[7];
+
+    /* Backward solve D*L'x = y (the result is stored into row 1 of A1) */
+    A1(1, 2)=x[2]=x[2]*D[2];
+    A1(1, 1)=x[1]=     D[1]-L[7]*x[2];
+    A1(1, 0)     =         -L[3]*x[1]-L[6]*x[2];
+
+    /* Forward solve Lx = e2 */
+    //x[0]=0.0;
+    //x[1]=0.0;
+    //x[2]=1.0;
+
+    /* Backward solve D*L'x = y (the result is stored into row 2 of A1) */
+    A1(2, 2)=x[2]=D[2];
+    A1(2, 1)=x[1]=    -L[7]*x[2];
+    A1(2, 0)     =    -L[3]*x[1]-L[6]*x[2];
+
+    return true;
+}
+
 bool PoseSolver::analyticalInverse3x3Symm(const cv::Matx<double, 3, 3>& Q,
     cv::Matx<double, 3, 3>& Qinv,
     const double& threshold)
@@ -411,7 +509,7 @@ bool PoseSolver::analyticalInverse3x3Symm(const cv::Matx<double, 3, 3>& Q,
     t12 = c * c;
     double det = -t4 * f + a * t2 + t7 * f - 2.0 * t9 * e + t12 * d;
 
-    if (fabs(det) < threshold) return false;
+    if (fabs(det) < threshold) { cv::invert(Q, Qinv, cv::DECOMP_SVD); return false; } // fall back to pseudoinverse
 
     // 3. Inverse
     double t15, t20, t24, t30;
@@ -502,7 +600,7 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     H(6, 4) = r(3) - dot_j5q3 * H(6, 2); H(7, 4) = r(4) - dot_j5q3 * H(7, 2); H(8, 4) = r(5) - dot_j5q3 * H(8, 2);
 
     Matx<double, 9, 1> q4 = H.col(4);
-    q4 /= cv::norm(q4);
+    q4 *= (1.0 / cv::norm(q4));
     set<double, 9, 1, 9, 6>(0, 4, H, q4);
 
     K(4, 0) = 0;
@@ -531,7 +629,7 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     H(8, 5) = r(2) - dot_j6q3 * H(8, 2) - dot_j6q5 * H(8, 4);
 
     Matx<double, 9, 1> q5 = H.col(5);
-    q5 /= cv::norm(q5);
+    q5 *= (1.0 / cv::norm(q5));
     set<double, 9, 1, 9, 6>(0, 5, H, q5);
 
     K(5, 0) = r(6) * H(0, 0) + r(7) * H(1, 0) + r(8) * H(2, 0);
@@ -573,10 +671,11 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     Matx<double, 9, 1> v1 = Pn.col(index1);
     v1 /= max_norm1;
     set<double, 9, 1, 9, 3>(0, 0, N, v1);
+    col_norms[index1] = -1.0; // mark to avoid use in subsequent loops
 
     for (int i = 0; i < 9; i++)
     {
-        if (i == index1) continue;
+        //if (i == index1) continue;
         if (col_norms[i] >= norm_threshold)
         {
             double cos_v1_x_col = fabs(Pn.col(i).dot(v1) / col_norms[i]);
@@ -592,16 +691,18 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     Matx<double, 9, 1> v2 = Pn.col(index2);
     Matx<double, 9, 1> n0 = N.col(0);
     v2 -= v2.dot(n0) * n0;
-    v2 /= cv::norm(v2);
+    v2 *= (1.0 / cv::norm(v2));
     set<double, 9, 1, 9, 3>(0, 1, N, v2);
+    col_norms[index2] = -1.0; // mark to avoid use in subsequent loops
 
     for (int i = 0; i < 9; i++)
     {
-        if (i == index2 || i == index1) continue;
+        //if (i == index2 || i == index1) continue;
         if (col_norms[i] >= norm_threshold)
         {
-            double cos_v1_x_col = fabs(Pn.col(i).dot(v1) / col_norms[i]);
-            double cos_v2_x_col = fabs(Pn.col(i).dot(v2) / col_norms[i]);
+            double inv_norm = 1.0 / col_norms[i];
+            double cos_v1_x_col = fabs(Pn.col(i).dot(v1) * inv_norm);
+            double cos_v2_x_col = fabs(Pn.col(i).dot(v2) * inv_norm);
 
             if (cos_v1_x_col + cos_v2_x_col <= min_dot1323)
             {
@@ -614,7 +715,7 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     Matx<double, 9, 1> v3 = Pn.col(index3);
     Matx<double, 9, 1> n1 = N.col(1);
     v3 -= (v3.dot(n1)) * n1 - (v3.dot(n0)) * n0;
-    v3 /= cv::norm(v3);
+    v3 *= (1.0 / cv::norm(v3));
     set<double, 9, 1, 9, 3>(0, 2, N, v3);
 
 }
@@ -635,17 +736,17 @@ void PoseSolver::nearestRotationMatrixSVD(const cv::Matx<double, 9, 1>& e,
 // Faster nearest rotation computation based on FOAM. See M. Lourakis: "An Efficient Solution to Absolute Orientation", ICPR 2016
 // and M. Lourakis, G. Terzakis: "Efficient Absolute Orientation Revisited", IROS 2018.
 /* Solve the nearest orthogonal approximation problem
-    * i.e., given e, find R minimizing ||R-e||_F
-    *
-    * The computation borrows from Markley's FOAM algorithm
-    * "Attitude Determination Using Vector Observations: A Fast Optimal Matrix Algorithm", J. Astronaut. Sci. 1993.
-    *
-    * See also M. Lourakis: "An Efficient Solution to Absolute Orientation", ICPR 2016
-    *
-    *  Copyright (C) 2019 Manolis Lourakis (lourakis **at** ics forth gr)
-    *  Institute of Computer Science, Foundation for Research & Technology - Hellas
-    *  Heraklion, Crete, Greece.
-    */
+ * i.e., given e, find R minimizing ||R-e||_F
+ *
+ * The computation borrows from Markley's FOAM algorithm
+ * "Attitude Determination Using Vector Observations: A Fast Optimal Matrix Algorithm", J. Astronaut. Sci. 1993.
+ *
+ * See also M. Lourakis: "An Efficient Solution to Absolute Orientation", ICPR 2016
+ *
+ *  Copyright (C) 2019 Manolis Lourakis (lourakis **at** ics forth gr)
+ *  Institute of Computer Science, Foundation for Research & Technology - Hellas
+ *  Heraklion, Crete, Greece.
+ */
 void PoseSolver::nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
     cv::Matx<double, 9, 1>& r)
 {
@@ -653,7 +754,7 @@ void PoseSolver::nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
     double l, lprev, det_e, e_sq, adj_e_sq, adj_e[9];
 
     // det(e)
-    det_e = e(0) * e(4) * e(8) - e(0) * e(5) * e(7) - e(1) * e(3) * e(8) + e(2) * e(3) * e(7) + e(1) * e(6) * e(5) - e(2) * e(6) * e(4);
+    det_e = ( e(0) * e(4) * e(8) - e(0) * e(5) * e(7) - e(1) * e(3) * e(8) ) + ( e(2) * e(3) * e(7) + e(1) * e(6) * e(5) - e(2) * e(6) * e(4) );
     if (fabs(det_e) < 1E-04) { // singular, handle it with SVD
         PoseSolver::nearestRotationMatrixSVD(e, r);
         return;
@@ -665,8 +766,8 @@ void PoseSolver::nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
     adj_e[6] = e(3) * e(7) - e(4) * e(6); adj_e[7] = e(1) * e(6) - e(0) * e(7); adj_e[8] = e(0) * e(4) - e(1) * e(3);
 
     // ||e||^2, ||adj(e)||^2
-    e_sq = e(0) * e(0) + e(1) * e(1) + e(2) * e(2) + e(3) * e(3) + e(4) * e(4) + e(5) * e(5) + e(6) * e(6) + e(7) * e(7) + e(8) * e(8);
-    adj_e_sq = adj_e[0] * adj_e[0] + adj_e[1] * adj_e[1] + adj_e[2] * adj_e[2] + adj_e[3] * adj_e[3] + adj_e[4] * adj_e[4] + adj_e[5] * adj_e[5] + adj_e[6] * adj_e[6] + adj_e[7] * adj_e[7] + adj_e[8] * adj_e[8];
+    e_sq = ( e(0) * e(0) + e(1) * e(1) + e(2) * e(2) ) + ( e(3) * e(3) + e(4) * e(4) + e(5) * e(5) ) + ( e(6) * e(6) + e(7) * e(7) + e(8) * e(8) );
+    adj_e_sq = ( adj_e[0] * adj_e[0] + adj_e[1] * adj_e[1] + adj_e[2] * adj_e[2] ) + ( adj_e[3] * adj_e[3] + adj_e[4] * adj_e[4] + adj_e[5] * adj_e[5] ) + ( adj_e[6] * adj_e[6] + adj_e[7] * adj_e[7] + adj_e[8] * adj_e[8] );
 
     // compute l_max with Newton-Raphson from FOAM's characteristic polynomial, i.e. eq.(23) - (26)
     l = 0.5*(e_sq + 3.0); // 1/2*(trace(mat(e)*mat(e)') + trace(eye(3)))
@@ -733,8 +834,8 @@ void PoseSolver::nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
 
 double PoseSolver::det3x3(const cv::Matx<double, 9, 1>& e)
 {
-    return e(0) * e(4) * e(8) + e(1) * e(5) * e(6) + e(2) * e(3) * e(7)
-        - e(6) * e(4) * e(2) - e(7) * e(5) * e(0) - e(8) * e(3) * e(1);
+    return ( e(0) * e(4) * e(8) + e(1) * e(5) * e(6) + e(2) * e(3) * e(7) )   // sum of the three positively-signed triple products
+         - ( e(6) * e(4) * e(2) + e(7) * e(5) * e(0) + e(8) * e(3) * e(1) ); // minus the sum of the three negatively-signed triple products
 }
 
 inline bool PoseSolver::positiveDepth(const SQPSolution& solution) const
@@ -815,8 +916,8 @@ double PoseSolver::orthogonalityError(const cv::Matx<double, 9, 1>& e)
     double dot_e1e3 = e(0) * e(6) + e(1) * e(7) + e(2) * e(8);
     double dot_e2e3 = e(3) * e(6) + e(4) * e(7) + e(5) * e(8);
 
-    return (sq_norm_e1 - 1) * (sq_norm_e1 - 1) + (sq_norm_e2 - 1) * (sq_norm_e2 - 1) + (sq_norm_e3 - 1) * (sq_norm_e3 - 1) +
-        2 * (dot_e1e2 * dot_e1e2 + dot_e1e3 * dot_e1e3 + dot_e2e3 * dot_e2e3);
+    return ( (sq_norm_e1 - 1) * (sq_norm_e1 - 1) + (sq_norm_e2 - 1) * (sq_norm_e2 - 1) ) + ( (sq_norm_e3 - 1) * (sq_norm_e3 - 1) +
+        2 * (dot_e1e2 * dot_e1e2 + dot_e1e3 * dot_e1e3 + dot_e2e3 * dot_e2e3) );
 }
 
 }
diff --git a/modules/3d/src/sqpnp.hpp b/modules/3d/src/sqpnp.hpp
index 078c07e906cf..ba44b8e86765 100644
--- a/modules/3d/src/sqpnp.hpp
+++ b/modules/3d/src/sqpnp.hpp
@@ -1,3 +1,10 @@
+// Implementation of SQPnP as described in the paper:
+//
+// "A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem" by G. Terzakis and M. Lourakis
+//     a) Paper:         https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460460.pdf
+//     b) Supplementary: https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460460-supp.pdf
+
+
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
@@ -158,6 +165,13 @@ class PoseSolver {
     */
     void solveSQPSystem(const cv::Matx<double, 9, 1>& r, cv::Matx<double, 9, 1>& delta);
 
+    /*
+    * @brief                Inverse of SPD 3x3 A via lower triangular sqrt-free Cholesky: A = L*D*L'
+    * @param A              The input matrix
+    * @param A1             The inverse
+    */
+    static bool invertSPD3x3(const cv::Matx<double, 3, 3>& A, cv::Matx<double, 3, 3>& A1);
+
     /*
     * @brief                Analytically computes the inverse of a symmetric 3x3 matrix using the
     *                       lower triangle.
diff --git a/modules/core/doc/intro.markdown b/modules/core/doc/intro.markdown
index 3a0ba76cd40e..55bcbe998f79 100644
--- a/modules/core/doc/intro.markdown
+++ b/modules/core/doc/intro.markdown
@@ -14,6 +14,9 @@ libraries. The following modules are available:
 -   @ref imgproc (**imgproc**) - an image processing module that includes linear and non-linear image filtering,
     geometrical image transformations (resize, affine and perspective warping, generic table-based
     remapping), color space conversion, histograms, and so on.
+-   @ref imgcodecs (**imgcodecs**) - includes functions for reading and writing image files in various formats.
+-   @ref videoio (**videoio**) - an easy-to-use interface to video capturing and video codecs.
+-   @ref highgui (**highgui**) - an easy-to-use interface to simple UI capabilities.
 -   @ref video (**video**) - a video analysis module that includes motion estimation, background subtraction,
     and object tracking algorithms.
 -   @ref _3d "3d" (**3d**) - basic multiple-view geometry algorithms, object pose estimation and elements of 3D reconstruction.
@@ -24,6 +27,7 @@ libraries. The following modules are available:
 -   @ref stereo (**stereo**) - stereo correspondence algorithms
--   @ref highgui (**highgui**) - an easy-to-use interface to simple UI capabilities.
--   @ref videoio (**videoio**) - an easy-to-use interface to video capturing and video codecs.
+-   @ref dnn (**dnn**) - Deep Neural Network module.
+-   @ref photo (**photo**) - advanced photo processing techniques like denoising, inpainting.
+-   @ref stitching (**stitching**) - functions for image stitching and panorama creation.
 -   ... some other helper modules, such as FLANN and Google test wrappers, Python bindings, and
     others.
 
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index a9ed18c2d204..6cfdec1b5915 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -60,11 +60,16 @@
 
 /**
 @defgroup core Core functionality
+
+The Core module is the backbone of OpenCV, offering fundamental data structures, matrix operations,
+and utility functions that other modules depend on. It is essential for handling image data,
+performing mathematical computations, and managing memory efficiently within the OpenCV ecosystem.
+
 @{
     @defgroup core_basic Basic structures
     @defgroup core_array Operations on arrays
     @defgroup core_async Asynchronous API
-    @defgroup core_xml XML/YAML Persistence
+    @defgroup core_xml XML/YAML/JSON Persistence
     @defgroup core_cluster Clustering
     @defgroup core_utils Utility and system functions and macros
     @{
@@ -76,7 +81,6 @@
         @defgroup core_utils_samples Utility functions for OpenCV samples
     @}
     @defgroup core_opengl OpenGL interoperability
-    @defgroup core_ipp Intel IPP Asynchronous C/C++ Converters
     @defgroup core_optim Optimization Algorithms
     @defgroup core_directx DirectX interoperability
     @defgroup core_eigen Eigen support
@@ -96,12 +100,13 @@
     @{
         @defgroup core_parallel_backend Parallel backends API
     @}
+    @defgroup core_quaternion Quaternion
 @}
  */
 
 namespace cv {
 
-//! @addtogroup core
+//! @addtogroup core_utils
 //! @{
 
 enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independently
@@ -115,6 +120,11 @@ enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independe
                                         //!< mutually exclusive.
                };
 
+//! @} core_utils
+
+//! @addtogroup core_array
+//! @{
+
 //! Covariation flags
 enum CovarFlags {
     /** The output covariance matrix is calculated as:
@@ -151,27 +161,6 @@ enum CovarFlags {
     COVAR_COLS      = 16
 };
 
-//! @addtogroup core_cluster
-//!  @{
-
-//! k-Means flags
-enum KmeansFlags {
-    /** Select random initial centers in each attempt.*/
-    KMEANS_RANDOM_CENTERS     = 0,
-    /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007].*/
-    KMEANS_PP_CENTERS         = 2,
-    /** During the first (and possibly the only) attempt, use the
-        user-supplied labels instead of computing them from the initial centers. For the second and
-        further attempts, use the random or semi-random centers. Use one of KMEANS_\*_CENTERS flag
-        to specify the exact method.*/
-    KMEANS_USE_INITIAL_LABELS = 1
-};
-
-//! @} core_cluster
-
-//! @addtogroup core_array
-//! @{
-
 enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
                    REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
                    REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
@@ -179,19 +168,12 @@ enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/column
                    REDUCE_SUM2 = 4  //!< the output is the sum of all squared rows/columns of the matrix.
                  };
 
-//! @} core_array
-
 /** @brief Swaps two matrices
 */
 CV_EXPORTS void swap(Mat& a, Mat& b);
 /** @overload */
 CV_EXPORTS void swap( UMat& a, UMat& b );
 
-//! @} core
-
-//! @addtogroup core_array
-//! @{
-
 /** @brief Computes the source location of an extrapolated pixel.
 
 The function computes and returns the coordinate of a donor pixel corresponding to the specified
@@ -492,6 +474,10 @@ For example:
 CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
                                   double alpha = 1, double beta = 0);
 
+/** @example samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
+Check @ref tutorial_how_to_scan_images "the corresponding tutorial" for more details
+*/
+
 /** @brief Performs a look-up table transform of an array.
 
 The function LUT fills the output array with values from the look-up table. Indices of the entries
@@ -3007,6 +2993,19 @@ class CV_EXPORTS RNG_MT19937
 //! @addtogroup core_cluster
 //!  @{
 
+//! k-means flags: select how the cluster centers are initialized
+enum KmeansFlags {
+    /** Select random initial centers in each attempt. */
+    KMEANS_RANDOM_CENTERS     = 0,
+    /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007]. */
+    KMEANS_PP_CENTERS         = 2,
+    /** During the first (and possibly the only) attempt, use the
+        user-supplied labels instead of computing them from the initial centers. For the second and
+        further attempts, use random or semi-random centers. Combine with one of the
+        KMEANS_\*_CENTERS flags to specify the exact method. */
+    KMEANS_USE_INITIAL_LABELS = 1
+};
+
 /** @example samples/cpp/snippets/kmeans.cpp
 An example on k-means clustering
 */
@@ -3021,7 +3020,7 @@ and groups the input samples around the clusters. As an output, \f$\texttt{bestL
 0-based cluster index for the sample stored in the \f$i^{th}\f$ row of the samples matrix.
 
 @note
--   (Python) An example on K-means clustering can be found at
+-   (Python) An example on k-means clustering can be found at
     opencv_source_code/samples/python/kmeans.py
 @param data Data for clustering. An array of N-Dimensional points with float coordinates is needed.
 Examples of this array can be:
diff --git a/modules/core/include/opencv2/core/affine.hpp b/modules/core/include/opencv2/core/affine.hpp
index 1806382e99ae..1aebf2b5071a 100644
--- a/modules/core/include/opencv2/core/affine.hpp
+++ b/modules/core/include/opencv2/core/affine.hpp
@@ -51,7 +51,7 @@
 namespace cv
 {
 
-//! @addtogroup core
+//! @addtogroup core_eigen
 //! @{
 
     /** @brief Affine transform
diff --git a/modules/core/include/opencv2/core/bufferpool.hpp b/modules/core/include/opencv2/core/bufferpool.hpp
index 4698e5da167d..e835ad025ca9 100644
--- a/modules/core/include/opencv2/core/bufferpool.hpp
+++ b/modules/core/include/opencv2/core/bufferpool.hpp
@@ -15,7 +15,7 @@
 namespace cv
 {
 
-//! @addtogroup core
+//! @addtogroup core_opencl
 //! @{
 
 class BufferPoolController
diff --git a/modules/core/include/opencv2/core/dualquaternion.hpp b/modules/core/include/opencv2/core/dualquaternion.hpp
index 1f644e9dc83b..4fec990461c3 100644
--- a/modules/core/include/opencv2/core/dualquaternion.hpp
+++ b/modules/core/include/opencv2/core/dualquaternion.hpp
@@ -30,7 +30,7 @@
 #include <opencv2/core/affine.hpp>
 
 namespace cv{
-//! @addtogroup core
+//! @addtogroup core_quaternion
 //! @{
 
 template <typename _Tp> class DualQuat;
diff --git a/modules/core/include/opencv2/core/exception.hpp b/modules/core/include/opencv2/core/exception.hpp
index 117cd47a55d6..2d189241bca7 100644
--- a/modules/core/include/opencv2/core/exception.hpp
+++ b/modules/core/include/opencv2/core/exception.hpp
@@ -127,28 +127,28 @@ CV_EXPORTS CV_NORETURN void error(const Exception& exc);
 By default the function prints information about the error to stderr,
 then it either stops if setBreakOnError() had been called before or raises the exception.
 It is possible to alternate error processing by using redirectError().
-@param _code - error code (Error::Code)
-@param _err - error description
-@param _func - function name. Available only when the compiler supports getting it
-@param _file - source file name where the error has occurred
-@param _line - line number in the source file where the error has occurred
+@param code - error code (Error::Code)
+@param err - error description
+@param func - function name. Available only when the compiler supports getting it
+@param file - source file name where the error has occurred
+@param line - line number in the source file where the error has occurred
 @see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert
  */
-CV_EXPORTS CV_NORETURN void error(Error::Code _code, const String& _err, const char* _func, const char* _file, int _line);
+CV_EXPORTS CV_NORETURN void error(Error::Code code, const String& err, const char* func, const char* file, int line);
 
 /*! @brief Signals an error and terminate application.
 
 By default the function prints information about the error to stderr, then it terminates application
 with std::terminate. The function is designed for invariants check in functions and methods with
 noexcept attribute.
-@param _code - error code (Error::Code)
-@param _err - error description
-@param _func - function name. Available only when the compiler supports getting it
-@param _file - source file name where the error has occurred
-@param _line - line number in the source file where the error has occurred
+@param code - error code (Error::Code)
+@param err - error description
+@param func - function name. Available only when the compiler supports getting it
+@param file - source file name where the error has occurred
+@param line - line number in the source file where the error has occurred
 @see CV_AssertTerminate
  */
-CV_EXPORTS CV_NORETURN void terminate(Error::Code _code, const String& _err, const char* _func, const char* _file, int _line) CV_NOEXCEPT;
+CV_EXPORTS CV_NORETURN void terminate(Error::Code code, const String& err, const char* func, const char* file, int line) CV_NOEXCEPT;
 
 
 #ifdef CV_STATIC_ANALYSIS
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 745a9dc40f90..2e78134992ff 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -194,6 +194,20 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double)
 #endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+template <typename _VecTp> inline _VecTp v_setzero_();         // type-generic zero initializer; each backend adds `template <>` specializations per vector type
+template <typename _VecTp> inline _VecTp v_setall_(uchar);     // type-generic broadcast initializer: one overload per scalar lane type below
+template <typename _VecTp> inline _VecTp v_setall_(schar);
+template <typename _VecTp> inline _VecTp v_setall_(ushort);
+template <typename _VecTp> inline _VecTp v_setall_(short);
+template <typename _VecTp> inline _VecTp v_setall_(unsigned);
+template <typename _VecTp> inline _VecTp v_setall_(int);
+template <typename _VecTp> inline _VecTp v_setall_(uint64);
+template <typename _VecTp> inline _VecTp v_setall_(int64);
+template <typename _VecTp> inline _VecTp v_setall_(float);
+template <typename _VecTp> inline _VecTp v_setall_(double);
+template <typename _VecTp> inline _VecTp v_setall_(hfloat);
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 3a8505a297b5..f2525f0b24c2 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -447,6 +447,10 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
     { return _Tpvec(_mm256_setzero_si256()); }                                   \
     inline _Tpvec v256_setall_##suffix(_Tp v)                                    \
     { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                        \
+    template <> inline _Tpvec v_setzero_()                                       \
+    { return v256_setzero_##suffix(); }                                          \
+    template <> inline _Tpvec v_setall_(_Tp v)                                   \
+    { return v256_setall_##suffix(v); }                                          \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
@@ -472,6 +476,10 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)
     { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
     inline _Tpvec v256_setall_##suffix(_Tp v)                            \
     { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
+    template <> inline _Tpvec v_setzero_()                               \
+    { return v256_setzero_##suffix(); }                                  \
+    template <> inline _Tpvec v_setall_(_Tp v)                           \
+    { return v256_setall_##suffix(v); }                                  \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
@@ -3179,6 +3187,20 @@ inline void v_pack_store(bfloat* ptr, const v_float32x8& a)
 
 inline void v256_cleanup() { _mm256_zeroall(); }
 
+#include "intrin_math.hpp" // NOTE: included inside the HAL namespace block, which is closed right after these wrappers
+inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
+inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
+inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
+inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
+inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
+inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
+// Double-precision wrappers pair v_float64x4 with v_int64x4; no v_erf wrapper is defined for 64-bit lanes here.
+inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
+inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
+inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
+inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
+inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index 64dab6b3ae0e..077b4d17a75a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -458,6 +458,10 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
     { return _Tpvec(_mm512_setzero_si512()); }                                     \
     inline _Tpvec v512_setall_##suffix(_Tp v)                                      \
     { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); }                          \
+    template <> inline _Tpvec v_setzero_()                                         \
+    { return v512_setzero_##suffix(); }                                            \
+    template <> inline _Tpvec v_setall_(_Tp v)                                     \
+    { return v512_setall_##suffix(v); }                                            \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP)      \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP)      \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP)      \
@@ -483,6 +487,10 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64,  int64)
     { return _Tpvec(_mm512_setzero_##zsuffix()); }                          \
     inline _Tpvec v512_setall_##suffix(_Tp v)                               \
     { return _Tpvec(_mm512_set1_##zsuffix(v)); }                            \
+    template <> inline _Tpvec v_setzero_()                                  \
+    { return v512_setzero_##suffix(); }                                     \
+    template <> inline _Tpvec v_setall_(_Tp v)                              \
+    { return v512_setall_##suffix(v); }                                     \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast)          \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast)          \
     OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast)          \
@@ -3070,6 +3078,20 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm
 
 inline void v512_cleanup() { _mm256_zeroall(); }
 
+#include "intrin_math.hpp" // NOTE: included inside the HAL namespace block, which is closed right after these wrappers
+inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f<v_float32x16, v_int32x16>(x); }
+inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f<v_float32x16, v_int32x16>(x); }
+inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f<v_float32x16, v_int32x16>(x, s, c); }
+inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f<v_float32x16, v_int32x16>(x); }
+inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f<v_float32x16, v_int32x16>(x); }
+inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f<v_float32x16, v_int32x16>(x); }
+// Double-precision wrappers pair v_float64x8 with v_int64x8; no v_erf wrapper is defined for 64-bit lanes here.
+inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f<v_float64x8, v_int64x8>(x); }
+inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f<v_float64x8, v_int64x8>(x); }
+inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f<v_float64x8, v_int64x8>(x, s, c); }
+inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f<v_float64x8, v_int64x8>(x); }
+inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f<v_float64x8, v_int64x8>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index fed7cc261a78..7eeed2ce9b5d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -263,8 +263,8 @@ Most of these operations return only one value.
 
 ### Other math
 
-- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp,
-                            @ref v_erf
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log,
+                            @ref v_erf, @ref v_sin, @ref v_cos
 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
 
 ### Conversions
@@ -366,6 +366,7 @@ Floating point:
 |broadcast_element  | x |   |
 |exp                | x | x |
 |log                | x | x |
+|sin, cos           | x | x |
 
  @{ */
 
@@ -745,10 +746,41 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
  */
 OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
 
-//! @cond IGNORED
+/**
+ * @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time
+ *
+ * Only for floating point types; lane i of @p s / @p c receives the sine / cosine of lane i of @p x. Core implementation steps:
+ * 1. Input Normalization: Scale the periodicity from \f$ 2\pi \f$ to 4 and reduce the angle to the range \f$ [0, \frac{\pi}{4}] \f$ using periodicity and trigonometric identities.
+ * 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$:
+ *   - For float16 and float32, use a Taylor series with 4 terms for sine and 5 terms for cosine.
+ *   - For float64, use a Taylor series with 7 terms for sine and 8 terms for cosine.
+ * 3. Select Results: select and convert the final sine and cosine values for the original input angle.
+ * @note The precision of the calculation depends on the implementation and the data type of the input vector.
+ */
+template<typename _Tp, int n>
+inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c)
+{
+    for( int i = 0; i < n; i++ )
+    {
+        const _Tp xi = x.s[i]; // read the lane once: s (or c) may be the very same register as x
+        s.s[i] = std::sin(xi);
+        c.s[i] = std::cos(xi);
+    }
+}
+
+/**
+ * @brief Sine \f$ sin(x) \f$ of elements
+ *
+ * Only for floating point types. Core implementation the same as @ref v_sincos.
+ */
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
+
+/**
+ * @brief Cosine \f$ cos(x) \f$ of elements
+ *
+ * Only for floating point types. Core implementation the same as @ref v_sincos.
+ */
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-//! @endcond
 
 /** @brief Absolute value of elements
 
@@ -2801,7 +2833,8 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
-inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
+inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \
+template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); }
 
 //! @name Init with zero
 //! @{
@@ -2847,7 +2880,8 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
-inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
+inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
+template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); }
 
 //! @name Init with value
 //! @{
diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
index 45f53de8a248..68d08b2ef43e 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
@@ -557,6 +557,10 @@ inline __m256i _lasx_256_castpd_si256(const __m256d& v)
     { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                                   \
     inline _Tpvec v256_setall_##suffix(_Tp v)                                     \
     { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); }                  \
+    template <> inline _Tpvec v_setzero_()                                        \
+    { return v256_setzero_##suffix(); }                                           \
+    template <> inline _Tpvec v_setall_(_Tp v)                                    \
+    { return v256_setall_##suffix(v); }                                           \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
@@ -588,7 +592,11 @@ inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
     inline _Tpvec v256_setzero_##suffix()                                 \
     { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                           \
     inline _Tpvec v256_setall_##suffix(_Tp v)                             \
-    { return _Tpvec(_v256_setall_##zsuffix(v)); }                   \
+    { return _Tpvec(_v256_setall_##zsuffix(v)); }                         \
+    template <> inline _Tpvec v_setzero_()                                \
+    { return v256_setzero_##suffix(); }                                   \
+    template <> inline _Tpvec v_setall_(_Tp v)                            \
+    { return v256_setall_##suffix(v); }                                   \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
     OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
@@ -3005,6 +3013,20 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
 
 inline void v256_cleanup() {}
 
+#include "intrin_math.hpp" // generic v_*_default_* math templates (e.g. v_exp_default_32f) that the wrappers below forward to
+inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); } // float32x8 overloads: instantiate the defaults with v_int32x8 as the integer companion type
+inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
+inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); } // results are written to the reference parameters s and c
+inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
+inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
+inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); } // NOTE(review): no float64x4 v_erf overload is declared in this block
+
+inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); } // float64x4 overloads: instantiate the defaults with v_int64x4 as the integer companion type
+inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
+inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); } // results are written to the reference parameters s and c
+inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
+inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
index aa997070c359..a2f23d6abe44 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
@@ -417,6 +417,10 @@ inline __m128i _lsx_128_castpd_si128(const __m128d& v)
     { return _Tpvec(__lsx_vldi(0)); }                                             \
     inline _Tpvec v_setall_##suffix(_Tp v)                                        \
     { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); }                    \
+    template <> inline _Tpvec v_setzero_()                                        \
+    { return v_setzero_##suffix(); }                                              \
+    template <> inline _Tpvec v_setall_(_Tp v)                                    \
+    { return v_setall_##suffix(v); }                                              \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, OPENCV_HAL_NOP)         \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, OPENCV_HAL_NOP)         \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, OPENCV_HAL_NOP)         \
@@ -448,6 +452,10 @@ inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
     { return _Tpvec(__lsx_vldi(0)); }                                       \
     inline _Tpvec v_setall_##suffix(_Tp v)                                  \
     { return _Tpvec(_v128_setall_##zsuffix(v)); }                           \
+    template <> inline _Tpvec v_setzero_()                                  \
+    { return v_setzero_##suffix(); }                                        \
+    template <> inline _Tpvec v_setall_(_Tp v)                              \
+    { return v_setall_##suffix(v); }                                        \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,     suffix,   cast)        \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,      suffix,   cast)        \
     OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,     suffix,   cast)        \
@@ -2515,6 +2523,20 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp" // generic v_*_default_* math templates (e.g. v_exp_default_32f) that the wrappers below forward to
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); } // float32x4 overloads: instantiate the defaults with v_int32x4 as the integer companion type
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); } // results are written to the reference parameters s and c
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); } // NOTE(review): no float64x2 v_erf overload is declared in this block
+
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); } // float64x2 overloads: instantiate the defaults with v_int64x2 as the integer companion type
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); } // results are written to the reference parameters s and c
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_math.hpp b/modules/core/include/opencv2/core/hal/intrin_math.hpp
index eaf3b3b78ba8..b7e649e74477 100644
--- a/modules/core/include/opencv2/core/hal/intrin_math.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@@ -2,10 +2,6 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 
-// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
-#ifdef OPENCV_HAL_INTRIN_HPP  // defined in intrin.hpp
-
-namespace CV__SIMD_NAMESPACE {
 
 /* Universal Intrinsics implementation of sin, cos, exp and log
 
@@ -34,434 +30,658 @@ namespace CV__SIMD_NAMESPACE {
 
   (this is the zlib license)
 */
-
-#ifndef OPENCV_HAL_MATH_HAVE_EXP
+#ifndef OPENCV_HAL_INTRIN_MATH_HPP
+#define OPENCV_HAL_INTRIN_MATH_HPP
 
 //! @name Exponential
 //! @{
-#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
-    // Implementation is the same as float32 vector.
-    inline v_float16 v_exp(const v_float16 &x) {
-        const v_float16 _vexp_lo_f16 = vx_setall_f16(hfloat(-10.7421875f));
-        const v_float16 _vexp_hi_f16 = vx_setall_f16(hfloat(11.f));
-        const v_float16 _vexp_half_fp16 = vx_setall_f16(hfloat(0.5f));
-        const v_float16 _vexp_one_fp16 = vx_setall_f16(hfloat(1.f));
-        const v_float16 _vexp_LOG2EF_f16 = vx_setall_f16(hfloat(1.44269504088896341f));
-        const v_float16 _vexp_C1_f16 = vx_setall_f16(hfloat(-6.93359375E-1f));
-        const v_float16 _vexp_C2_f16 = vx_setall_f16(hfloat(2.12194440E-4f));
-        const v_float16 _vexp_p0_f16 = vx_setall_f16(hfloat(1.9875691500E-4f));
-        const v_float16 _vexp_p1_f16 = vx_setall_f16(hfloat(1.3981999507E-3f));
-        const v_float16 _vexp_p2_f16 = vx_setall_f16(hfloat(8.3334519073E-3f));
-        const v_float16 _vexp_p3_f16 = vx_setall_f16(hfloat(4.1665795894E-2f));
-        const v_float16 _vexp_p4_f16 = vx_setall_f16(hfloat(1.6666665459E-1f));
-        const v_float16 _vexp_p5_f16 = vx_setall_f16(hfloat(5.0000001201E-1f));
-        const v_int16 _vexp_bias_s16 = vx_setall_s16(0xf);
-
-        v_float16 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
-        v_int16 _vexp_mm;
-
-        // compute exponential of x
-        _vexp_x = v_max(x, _vexp_lo_f16);
-        _vexp_x = v_min(_vexp_x, _vexp_hi_f16);
-
-        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
-        _vexp_mm = v_floor(_vexp_);
-        _vexp_ = v_cvt_f16(_vexp_mm);
-        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
-        _vexp_mm = v_shl(_vexp_mm, 10);
-
-        _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
-        _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
-        _vexp_xx = v_mul(_vexp_x, _vexp_x);
-
-        _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
-
-        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
-        _vexp_y = v_add(_vexp_y, _vexp_one_fp16);
-        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
-
-        // exp(NAN) -> NAN
-        v_float16 mask_not_nan = v_not_nan(x);
-        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
-    }
-#endif
-
-    inline v_float32 v_exp(const v_float32 &x) {
-        const v_float32 _vexp_lo_f32 = vx_setall_f32(-88.3762626647949f);
-        const v_float32 _vexp_hi_f32 = vx_setall_f32(89.f);
-        const v_float32 _vexp_half_fp32 = vx_setall_f32(0.5f);
-        const v_float32 _vexp_one_fp32 = vx_setall_f32(1.f);
-        const v_float32 _vexp_LOG2EF_f32 = vx_setall_f32(1.44269504088896341f);
-        const v_float32 _vexp_C1_f32 = vx_setall_f32(-6.93359375E-1f);
-        const v_float32 _vexp_C2_f32 = vx_setall_f32(2.12194440E-4f);
-        const v_float32 _vexp_p0_f32 = vx_setall_f32(1.9875691500E-4f);
-        const v_float32 _vexp_p1_f32 = vx_setall_f32(1.3981999507E-3f);
-        const v_float32 _vexp_p2_f32 = vx_setall_f32(8.3334519073E-3f);
-        const v_float32 _vexp_p3_f32 = vx_setall_f32(4.1665795894E-2f);
-        const v_float32 _vexp_p4_f32 = vx_setall_f32(1.6666665459E-1f);
-        const v_float32 _vexp_p5_f32 = vx_setall_f32(5.0000001201E-1f);
-        const v_int32 _vexp_bias_s32 = vx_setall_s32(0x7f);
-
-        v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
-        v_int32 _vexp_mm;
-
-        // compute exponential of x
-        _vexp_x = v_max(x, _vexp_lo_f32);
-        _vexp_x = v_min(_vexp_x, _vexp_hi_f32);
-
-        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
-        _vexp_mm = v_floor(_vexp_);
-        _vexp_ = v_cvt_f32(_vexp_mm);
-        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
-        _vexp_mm = v_shl(_vexp_mm, 23);
-
-        _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
-        _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
-        _vexp_xx = v_mul(_vexp_x, _vexp_x);
-
-        _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
-        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
-
-        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
-        _vexp_y = v_add(_vexp_y, _vexp_one_fp32);
-        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
-
-        // exp(NAN) -> NAN
-        v_float32 mask_not_nan = v_not_nan(x);
-        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
-    }
-
-#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
-    inline v_float64 v_exp(const v_float64 &x) {
-        const v_float64 _vexp_lo_f64 = vx_setall_f64(-709.43613930310391424428);
-        const v_float64 _vexp_hi_f64 = vx_setall_f64(710.);
-        const v_float64 _vexp_half_f64 = vx_setall_f64(0.5);
-        const v_float64 _vexp_one_f64 = vx_setall_f64(1.0);
-        const v_float64 _vexp_two_f64 = vx_setall_f64(2.0);
-        const v_float64 _vexp_LOG2EF_f64 = vx_setall_f64(1.44269504088896340736);
-        const v_float64 _vexp_C1_f64 = vx_setall_f64(-6.93145751953125E-1);
-        const v_float64 _vexp_C2_f64 = vx_setall_f64(-1.42860682030941723212E-6);
-        const v_float64 _vexp_p0_f64 = vx_setall_f64(1.26177193074810590878E-4);
-        const v_float64 _vexp_p1_f64 = vx_setall_f64(3.02994407707441961300E-2);
-        const v_float64 _vexp_p2_f64 = vx_setall_f64(9.99999999999999999910E-1);
-        const v_float64 _vexp_q0_f64 = vx_setall_f64(3.00198505138664455042E-6);
-        const v_float64 _vexp_q1_f64 = vx_setall_f64(2.52448340349684104192E-3);
-        const v_float64 _vexp_q2_f64 = vx_setall_f64(2.27265548208155028766E-1);
-        const v_float64 _vexp_q3_f64 = vx_setall_f64(2.00000000000000000009E0);
-        const v_int64 _vexp_bias_s64 = vx_setall_s64(0x3ff);
-
-        v_float64 _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
-        v_int64 _vexp_mm;
-
-        // compute exponential of x
-        _vexp_x = v_max(x, _vexp_lo_f64);
-        _vexp_x = v_min(_vexp_x, _vexp_hi_f64);
-
-        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
-        _vexp_mm = v_expand_low(v_floor(_vexp_));
-        _vexp_ = v_cvt_f64(_vexp_mm);
-        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
-        _vexp_mm = v_shl(_vexp_mm, 52);
-
-        _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
-        _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
-        _vexp_xx = v_mul(_vexp_x, _vexp_x);
-
-        _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
-        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
-        _vexp_y = v_mul(_vexp_y, _vexp_x);
-
-        _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
-        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
-        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
-
-        _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
-        _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
-        _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
-
-        // exp(NAN) -> NAN
-        v_float64 mask_not_nan = v_not_nan(x);
-        return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000)));
-    }
-#endif
-
-#define OPENCV_HAL_MATH_HAVE_EXP 1
-//! @}
-#endif
+// Half-precision exp(x): same algorithm as the float32 version, with the clamp range, exponent bias (0xf) and exponent shift (10) chosen for FP16.
+template<typename _TpVec16F, typename _TpVec16S> // _TpVec16F: FP16 vector type; _TpVec16S: integer vector type used for the exponent bits (16-bit lanes, judging by the (short) casts below)
+inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
+    const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f); // lower clamp bound for the input
+    const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f); // upper clamp bound for the input
+    const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
+    const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
+    const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f); // log2(e)
+    const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f); // C1 + C2 ~= -ln(2), split into two constants
+    const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
+    const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f); // p0..p5: coefficients of the polynomial evaluated below
+    const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
+    const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
+    const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
+    const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
+    const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
+
+    _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
+    _TpVec16S _vexp_mm;
+    const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf); // FP16 exponent bias (15)
+
+    // compute exponential of x: clamp the input to [lo, hi] first
+    _vexp_x = v_max(x, _vexp_lo_f16);
+    _vexp_x = v_min(_vexp_x, _vexp_hi_f16);
+
+    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16); // x * log2(e) + 0.5
+    _vexp_mm = v_floor(_vexp_); // n = floor(x * log2(e) + 0.5), as integers
+    _vexp_ = v_cvt_f16(_vexp_mm); // n converted back to floating point
+    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); // n + bias
+    _vexp_mm = v_shl(_vexp_mm, 10); // shift into the FP16 exponent field, so the bits read as 2^n
+
+    _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); // x += n * C1
+    _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); // x += n * C2; together: x - n * ln(2)
+    _vexp_xx = v_mul(_vexp_x, _vexp_x); // x^2
+
+    _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); // Horner evaluation of the p0..p5 polynomial in x
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
+
+    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); // poly * x^2 + x
+    _vexp_y = v_add(_vexp_y, _vexp_one_fp16); // ... + 1
+    _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); // scale by 2^n
+
+    // exp(NAN) -> NAN: lanes where x is NaN get the bit pattern 0x7e00 (an FP16 NaN)
+    _TpVec16F mask_not_nan = v_not_nan(x);
+    return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
+}
 
-#ifndef OPENCV_HAL_MATH_HAVE_LOG
+template<typename _TpVec32F, typename _TpVec32S> // _TpVec32F: float32 vector type; _TpVec32S: integer vector type used for the exponent bits (32-bit lanes, judging by the (int) casts below)
+inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) { // element-wise exp(x) for float32 vectors; NaN inputs yield NaN
+    const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f); // lower clamp bound for the input
+    const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f); // upper clamp bound for the input
+    const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
+    const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
+    const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); // log2(e)
+    const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f); // C1 + C2 ~= -ln(2), split into two constants
+    const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
+    const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f); // p0..p5: coefficients of the polynomial evaluated below
+    const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
+    const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
+    const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
+    const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
+    const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);
+
+    _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
+    _TpVec32S _vexp_mm;
+    const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); // float32 exponent bias (127)
+
+    // compute exponential of x: clamp the input to [lo, hi] first
+    _vexp_x = v_max(x, _vexp_lo_f32);
+    _vexp_x = v_min(_vexp_x, _vexp_hi_f32);
+
+    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32); // x * log2(e) + 0.5
+    _vexp_mm = v_floor(_vexp_); // n = floor(x * log2(e) + 0.5), as integers
+    _vexp_ = v_cvt_f32(_vexp_mm); // n converted back to floating point
+    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32); // n + bias
+    _vexp_mm = v_shl(_vexp_mm, 23); // shift into the float32 exponent field, so the bits read as 2^n
+
+    _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); // x += n * C1
+    _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); // x += n * C2; together: x - n * ln(2)
+    _vexp_xx = v_mul(_vexp_x, _vexp_x); // x^2
+
+    _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); // Horner evaluation of the p0..p5 polynomial in x
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
+    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
+
+    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); // poly * x^2 + x
+    _vexp_y = v_add(_vexp_y, _vexp_one_fp32); // ... + 1
+    _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); // scale by 2^n
+
+    // exp(NAN) -> NAN: lanes where x is NaN get the bit pattern 0x7fc00000 (a float32 NaN)
+    _TpVec32F mask_not_nan = v_not_nan(x);
+    return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
+}
+
+template<typename _TpVec64F, typename _TpVec64S> // _TpVec64F: float64 vector type; _TpVec64S: integer vector type used for the exponent bits (64-bit lanes, judging by the (int64) casts below)
+inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) { // element-wise exp(x) for float64 vectors; NaN inputs yield NaN
+    const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428); // lower clamp bound for the input
+    const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.); // upper clamp bound for the input
+    const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
+    const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
+    const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
+    const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); // log2(e)
+    const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1); // C1 + C2 ~= -ln(2), split into two constants
+    const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
+    const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4); // p0..p2: numerator polynomial coefficients (in x^2)
+    const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
+    const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
+    const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6); // q0..q3: denominator polynomial coefficients (in x^2)
+    const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
+    const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
+    const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);
+
+    _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
+    _TpVec64S _vexp_mm;
+    const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); // float64 exponent bias (1023)
+
+    // compute exponential of x: clamp the input to [lo, hi] first
+    _vexp_x = v_max(x, _vexp_lo_f64);
+    _vexp_x = v_min(_vexp_x, _vexp_hi_f64);
+
+    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); // x * log2(e) + 0.5
+    _vexp_mm = v_expand_low(v_floor(_vexp_)); // n = floor(...); presumably v_floor yields narrower ints that v_expand_low widens to 64-bit lanes - TODO confirm
+    _vexp_ = v_cvt_f64(_vexp_mm); // n converted back to floating point
+    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); // n + bias
+    _vexp_mm = v_shl(_vexp_mm, 52); // shift into the float64 exponent field, so the bits read as 2^n
+
+    _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); // x += n * C1
+    _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); // x += n * C2; together: x - n * ln(2)
+    _vexp_xx = v_mul(_vexp_x, _vexp_x); // x^2
+
+    _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); // P(x^2) by Horner's scheme
+    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
+    _vexp_y = v_mul(_vexp_y, _vexp_x); // y = x * P(x^2)
+
+    _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); // z = Q(x^2) by Horner's scheme
+    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
+    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
+
+    _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); // y / (z - y)
+    _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); // 1 + 2 * y / (z - y)
+    _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); // scale by 2^n
+
+    // exp(NAN) -> NAN: lanes where x is NaN get the bit pattern 0x7FF8000000000000 (a float64 NaN)
+    _TpVec64F mask_not_nan = v_not_nan(x);
+    return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
+}
+//! @}
 
 //! @name Natural Logarithm
 //! @{
-#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
-    inline v_float16 v_log(const v_float16 &x) {
-        const v_float16 _vlog_one_fp16 = vx_setall_f16(hfloat(1.0f));
-        const v_float16 _vlog_SQRTHF_fp16 = vx_setall_f16(hfloat(0.707106781186547524f));
-        const v_float16 _vlog_q1_fp16 = vx_setall_f16(hfloat(-2.12194440E-4f));
-        const v_float16 _vlog_q2_fp16 = vx_setall_f16(hfloat(0.693359375f));
-        const v_float16 _vlog_p0_fp16 = vx_setall_f16(hfloat(7.0376836292E-2f));
-        const v_float16 _vlog_p1_fp16 = vx_setall_f16(hfloat(-1.1514610310E-1f));
-        const v_float16 _vlog_p2_fp16 = vx_setall_f16(hfloat(1.1676998740E-1f));
-        const v_float16 _vlog_p3_fp16 = vx_setall_f16(hfloat(-1.2420140846E-1f));
-        const v_float16 _vlog_p4_fp16 = vx_setall_f16(hfloat(1.4249322787E-1f));
-        const v_float16 _vlog_p5_fp16 = vx_setall_f16(hfloat(-1.6668057665E-1f));
-        const v_float16 _vlog_p6_fp16 = vx_setall_f16(hfloat(2.0000714765E-1f));
-        const v_float16 _vlog_p7_fp16 = vx_setall_f16(hfloat(-2.4999993993E-1f));
-        const v_float16 _vlog_p8_fp16 = vx_setall_f16(hfloat(3.3333331174E-1f));
-        const v_int16 _vlog_inv_mant_mask_s16 = vx_setall_s16(~0x7c00);
-
-        v_float16 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
-        v_int16 _vlog_ux, _vlog_emm0;
-
-        _vlog_ux = v_reinterpret_as_s16(x);
-        _vlog_emm0 = v_shr(_vlog_ux, 10);
-
-        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
-        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(vx_setall_f16(hfloat(0.5f))));
-        _vlog_x = v_reinterpret_as_f16(_vlog_ux);
-
-        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s16(0xf));
-        _vlog_e = v_cvt_f16(_vlog_emm0);
-
-        _vlog_e = v_add(_vlog_e, _vlog_one_fp16);
-
-        v_float16 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
-        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
-        _vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
-        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
-        _vlog_x = v_add(_vlog_x, _vlog_tmp);
-
-        _vlog_z = v_mul(_vlog_x, _vlog_x);
-
-        _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
-        _vlog_y = v_mul(_vlog_y, _vlog_x);
-        _vlog_y = v_mul(_vlog_y, _vlog_z);
-
-        _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
-
-        _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f16(hfloat(0.5f))));
-
-        _vlog_x = v_add(_vlog_x, _vlog_y);
-        _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
-        // log(0) -> -INF
-        v_float16 mask_zero = v_eq(x, vx_setzero_f16());
-        _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(vx_setall_s16(0xfc00)), _vlog_x);
-        // log(NEG), log(NAN) -> NAN
-        v_float16 mask_not_nan = v_ge(x, vx_setzero_f16());
-        _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
-        // log(INF) -> INF
-        v_float16 mask_inf = v_eq(x, v_reinterpret_as_f16(vx_setall_s16(0x7c00)));
-        _vlog_x = v_select(mask_inf, x, _vlog_x);
-        return _vlog_x;
-    }
-#endif
-
-    inline v_float32 v_log(const v_float32 &x) {
-        const v_float32 _vlog_one_fp32 = vx_setall_f32(1.0f);
-        const v_float32 _vlog_SQRTHF_fp32 = vx_setall_f32(0.707106781186547524f);
-        const v_float32 _vlog_q1_fp32 = vx_setall_f32(-2.12194440E-4f);
-        const v_float32 _vlog_q2_fp32 = vx_setall_f32(0.693359375f);
-        const v_float32 _vlog_p0_fp32 = vx_setall_f32(7.0376836292E-2f);
-        const v_float32 _vlog_p1_fp32 = vx_setall_f32(-1.1514610310E-1f);
-        const v_float32 _vlog_p2_fp32 = vx_setall_f32(1.1676998740E-1f);
-        const v_float32 _vlog_p3_fp32 = vx_setall_f32(-1.2420140846E-1f);
-        const v_float32 _vlog_p4_fp32 = vx_setall_f32(1.4249322787E-1f);
-        const v_float32 _vlog_p5_fp32 = vx_setall_f32(-1.6668057665E-1f);
-        const v_float32 _vlog_p6_fp32 = vx_setall_f32(2.0000714765E-1f);
-        const v_float32 _vlog_p7_fp32 = vx_setall_f32(-2.4999993993E-1f);
-        const v_float32 _vlog_p8_fp32 = vx_setall_f32(3.3333331174E-1f);
-        const v_int32 _vlog_inv_mant_mask_s32 = vx_setall_s32(~0x7f800000);
-
-        v_float32 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
-        v_int32 _vlog_ux, _vlog_emm0;
-
-        _vlog_ux = v_reinterpret_as_s32(x);
-        _vlog_emm0 = v_shr(_vlog_ux, 23);
-
-        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
-        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(vx_setall_f32(0.5f)));
-        _vlog_x = v_reinterpret_as_f32(_vlog_ux);
-
-        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s32(0x7f));
-        _vlog_e = v_cvt_f32(_vlog_emm0);
-
-        _vlog_e = v_add(_vlog_e, _vlog_one_fp32);
-
-        v_float32 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
-        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
-        _vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
-        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
-        _vlog_x = v_add(_vlog_x, _vlog_tmp);
-
-        _vlog_z = v_mul(_vlog_x, _vlog_x);
-
-        _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
-        _vlog_y = v_mul(_vlog_y, _vlog_x);
-        _vlog_y = v_mul(_vlog_y, _vlog_z);
-
-        _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
-
-        _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f32(0.5)));
-
-        _vlog_x = v_add(_vlog_x, _vlog_y);
-        _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
-        // log(0) -> -INF
-        v_float32 mask_zero = v_eq(x, vx_setzero_f32());
-        _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(vx_setall_s32(0xff800000)), _vlog_x);
-        // log(NEG), log(NAN) -> NAN
-        v_float32 mask_not_nan = v_ge(x, vx_setzero_f32());
-        _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
-        // log(INF) -> INF
-        v_float32 mask_inf = v_eq(x, v_reinterpret_as_f32(vx_setall_s32(0x7f800000)));
-        _vlog_x = v_select(mask_inf, x, _vlog_x);
-        return _vlog_x;
-    }
-
-#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
-    inline v_float64 v_log(const v_float64 &x) {
-        const v_float64 _vlog_one_fp64 = vx_setall_f64(1.0);
-        const v_float64 _vlog_SQRTHF_fp64 = vx_setall_f64(0.7071067811865475244);
-        const v_float64 _vlog_p0_fp64 = vx_setall_f64(1.01875663804580931796E-4);
-        const v_float64 _vlog_p1_fp64 = vx_setall_f64(4.97494994976747001425E-1);
-        const v_float64 _vlog_p2_fp64 = vx_setall_f64(4.70579119878881725854);
-        const v_float64 _vlog_p3_fp64 = vx_setall_f64(1.44989225341610930846E1);
-        const v_float64 _vlog_p4_fp64 = vx_setall_f64(1.79368678507819816313E1);
-        const v_float64 _vlog_p5_fp64 = vx_setall_f64(7.70838733755885391666);
-        const v_float64 _vlog_q0_fp64 = vx_setall_f64(1.12873587189167450590E1);
-        const v_float64 _vlog_q1_fp64 = vx_setall_f64(4.52279145837532221105E1);
-        const v_float64 _vlog_q2_fp64 = vx_setall_f64(8.29875266912776603211E1);
-        const v_float64 _vlog_q3_fp64 = vx_setall_f64(7.11544750618563894466E1);
-        const v_float64 _vlog_q4_fp64 = vx_setall_f64(2.31251620126765340583E1);
-
-        const v_float64 _vlog_C0_fp64 = vx_setall_f64(2.121944400546905827679e-4);
-        const v_float64 _vlog_C1_fp64 = vx_setall_f64(0.693359375);
-        const v_int64 _vlog_inv_mant_mask_s64 = vx_setall_s64(~0x7ff0000000000000);
-
-        v_float64 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
-        v_int64 _vlog_ux, _vlog_emm0;
-
-        _vlog_ux = v_reinterpret_as_s64(x);
-        _vlog_emm0 = v_shr(_vlog_ux, 52);
-
-        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
-        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(vx_setall_f64(0.5)));
-        _vlog_x = v_reinterpret_as_f64(_vlog_ux);
-
-        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s64(0x3ff));
-        _vlog_e = v_cvt_f64(_vlog_emm0);
-
-        _vlog_e = v_add(_vlog_e, _vlog_one_fp64);
-
-        v_float64 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
-        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
-        _vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
-        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
-        _vlog_x = v_add(_vlog_x, _vlog_tmp);
-
-        _vlog_xx = v_mul(_vlog_x, _vlog_x);
-
-        _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
-        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
-        _vlog_y = v_mul(_vlog_y, _vlog_x);
-        _vlog_y = v_mul(_vlog_y, _vlog_xx);
-
-        _vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
-        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
-        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
-        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
-        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
-
-        _vlog_z = v_div(_vlog_y, _vlog_z);
-        _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
-        _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, vx_setall_f64(0.5)));
-
-        _vlog_z = v_add(_vlog_z, _vlog_x);
-        _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
-
-        // log(0) -> -INF
-        v_float64 mask_zero = v_eq(x, vx_setzero_f64());
-        _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(vx_setall_s64(0xfff0000000000000)), _vlog_z);
-        // log(NEG), log(NAN) -> NAN
-        v_float64 mask_not_nan = v_ge(x, vx_setzero_f64());
-        _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(vx_setall_s64(0x7ff8000000000000)));
-        // log(INF) -> INF
-        v_float64 mask_inf = v_eq(x, v_reinterpret_as_f64(vx_setall_s64(0x7ff0000000000000)));
-        _vlog_z = v_select(mask_inf, x, _vlog_z);
-        return _vlog_z;
-    }
-#endif
-
-#define OPENCV_HAL_MATH_HAVE_LOG 1
-//! @}
-#endif
+template<typename _TpVec16F, typename _TpVec16S>  // _TpVec16F: f16 vector type; _TpVec16S: s16 vector type used for the bit manipulation
+inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {  // lane-wise natural logarithm of x; special inputs are patched at the end
+    const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
+    const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);  // sqrt(1/2)
+    const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);  // q1 + q2 = 0.693147..., i.e. ln(2) split into two terms
+    const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
+    const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);  // p0..p8: polynomial coefficients, p0 multiplies the highest power
+    const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
+    const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
+    const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
+    const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
+    const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
+    const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
+    const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
+    const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
+
+    _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
+    _TpVec16S _vlog_ux, _vlog_emm0;
+    const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);  // keeps every bit except 0x7c00 (bits 10..14)
+
+    _vlog_ux = v_reinterpret_as_s16(x);  // raw bits of x
+    _vlog_emm0 = v_shr(_vlog_ux, 10);  // drop the low 10 bits; what remains is the exponent source
+
+    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);  // clear bits 10..14 ...
+    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));  // ... and OR in the bit pattern of 0.5
+    _vlog_x = v_reinterpret_as_f16(_vlog_ux);  // x now carries the original low bits with the exponent of 0.5
+
+    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));  // subtract 0xf from the shifted bits
+    _vlog_e = v_cvt_f16(_vlog_emm0);  // exponent as a floating-point value
+
+    _vlog_e = v_add(_vlog_e, _vlog_one_fp16);  // e += 1
+
+    _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);  // lanes where x < sqrt(1/2)
+    _vlog_tmp = v_and(_vlog_x, _vlog_mask);  // x in masked lanes, 0 elsewhere
+    _vlog_x = v_sub(_vlog_x, _vlog_one_fp16);  // x - 1
+    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));  // masked lanes: e -= 1
+    _vlog_x = v_add(_vlog_x, _vlog_tmp);  // masked lanes end up with 2x - 1, the others with x - 1
+
+    _vlog_z = v_mul(_vlog_x, _vlog_x);  // z = x^2
+
+    _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);  // Horner evaluation of P(x) with p0..p8
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
+    _vlog_y = v_mul(_vlog_y, _vlog_x);
+    _vlog_y = v_mul(_vlog_y, _vlog_z);  // y = x^3 * P(x)
+
+    _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);  // y += e * q1 (small part of e * ln 2)
+
+    _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));  // y -= x^2 / 2
+
+    _vlog_x = v_add(_vlog_x, _vlog_y);  // x - x^2/2 + x^3 * P(x) + e * q1
+    _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);  // + e * q2 (large part of e * ln 2)
+    // log(0) -> -INF
+    _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
+    _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);  // 0xfc00 is the -INF bit pattern
+    // log(NEG), log(NAN) -> NAN
+    _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());  // false for negative and NaN lanes
+    _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));  // 0x7e00 is the NaN pattern written out
+    // log(INF) -> INF
+    _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));  // 0x7c00 is the +INF bit pattern
+    _vlog_x = v_select(mask_inf, x, _vlog_x);
+    return _vlog_x;
+}
 
-/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
-   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
-*/
+template<typename _TpVec32F, typename _TpVec32S>  // _TpVec32F: f32 vector type; _TpVec32S: s32 vector type used for the bit manipulation
+inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {  // lane-wise natural logarithm of x; special inputs are patched at the end
+    const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
+    const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);  // sqrt(1/2)
+    const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);  // q1 + q2 = 0.693147..., i.e. ln(2) split into two terms
+    const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
+    const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);  // p0..p8: polynomial coefficients, p0 multiplies the highest power
+    const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
+    const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
+    const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
+    const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
+    const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
+    const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
+    const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
+    const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);
+
+    _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
+    _TpVec32S _vlog_ux, _vlog_emm0;
+    const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);  // keeps every bit except 0x7f800000 (bits 23..30)
+
+    _vlog_ux = v_reinterpret_as_s32(x);  // raw bits of x
+    _vlog_emm0 = v_shr(_vlog_ux, 23);  // drop the low 23 bits; what remains is the exponent source
+
+    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);  // clear bits 23..30 ...
+    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));  // ... and OR in the bit pattern of 0.5
+    _vlog_x = v_reinterpret_as_f32(_vlog_ux);  // x now carries the original low bits with the exponent of 0.5
+
+    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));  // subtract 0x7f from the shifted bits
+    _vlog_e = v_cvt_f32(_vlog_emm0);  // exponent as a floating-point value
+
+    _vlog_e = v_add(_vlog_e, _vlog_one_fp32);  // e += 1
+
+    _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);  // lanes where x < sqrt(1/2)
+    _vlog_tmp = v_and(_vlog_x, _vlog_mask);  // x in masked lanes, 0 elsewhere
+    _vlog_x = v_sub(_vlog_x, _vlog_one_fp32);  // x - 1
+    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));  // masked lanes: e -= 1
+    _vlog_x = v_add(_vlog_x, _vlog_tmp);  // masked lanes end up with 2x - 1, the others with x - 1
+
+    _vlog_z = v_mul(_vlog_x, _vlog_x);  // z = x^2
+
+    _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);  // Horner evaluation of P(x) with p0..p8
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
+    _vlog_y = v_mul(_vlog_y, _vlog_x);
+    _vlog_y = v_mul(_vlog_y, _vlog_z);  // y = x^3 * P(x)
+
+    _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);  // y += e * q1 (small part of e * ln 2)
+
+    _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));  // y -= x^2 / 2
+
+    _vlog_x = v_add(_vlog_x, _vlog_y);  // x - x^2/2 + x^3 * P(x) + e * q1
+    _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);  // + e * q2 (large part of e * ln 2)
+    // log(0) -> -INF
+    _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
+    _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);  // 0xff800000 is the -INF bit pattern
+    // log(NEG), log(NAN) -> NAN
+    _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());  // false for negative and NaN lanes
+    _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));  // 0x7fc00000 is the NaN pattern written out
+    // log(INF) -> INF
+    _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));  // 0x7f800000 is the +INF bit pattern
+    _vlog_x = v_select(mask_inf, x, _vlog_x);
+    return _vlog_x;
+}
 
-#ifndef OPENCV_HAL_MATH_HAVE_ERF
+template<typename _TpVec64F, typename _TpVec64S>  // _TpVec64F: f64 vector type; _TpVec64S: s64 vector type used for the bit manipulation
+inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {  // lane-wise natural logarithm of x; special inputs are patched at the end
+    const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
+    const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);  // sqrt(1/2)
+    const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);  // p0..p5: numerator coefficients, p0 multiplies the highest power
+    const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
+    const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
+    const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
+    const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
+    const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
+    const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);  // q0..q4: denominator coefficients (leading coefficient is an implicit 1)
+    const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
+    const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
+    const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
+    const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);
+
+    const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);  // C1 - C0 = 0.693147..., i.e. ln(2) split into two terms
+    const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);
+
+    _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
+    _TpVec64S _vlog_ux, _vlog_emm0;
+    const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);  // keeps every bit except bits 52..62
+
+    _vlog_ux = v_reinterpret_as_s64(x);  // raw bits of x
+    _vlog_emm0 = v_shr(_vlog_ux, 52);  // drop the low 52 bits; what remains is the exponent source
+
+    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);  // clear bits 52..62 ...
+    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));  // ... and OR in the bit pattern of 0.5
+    _vlog_x = v_reinterpret_as_f64(_vlog_ux);  // x now carries the original low bits with the exponent of 0.5
+
+    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));  // subtract 0x3ff from the shifted bits
+    _vlog_e = v_cvt_f64(_vlog_emm0);  // exponent as a floating-point value
+
+    _vlog_e = v_add(_vlog_e, _vlog_one_fp64);  // e += 1
+
+    _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);  // lanes where x < sqrt(1/2)
+    _vlog_tmp = v_and(_vlog_x, _vlog_mask);  // x in masked lanes, 0 elsewhere
+    _vlog_x = v_sub(_vlog_x, _vlog_one_fp64);  // x - 1
+    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));  // masked lanes: e -= 1
+    _vlog_x = v_add(_vlog_x, _vlog_tmp);  // masked lanes end up with 2x - 1, the others with x - 1
+
+    _vlog_xx = v_mul(_vlog_x, _vlog_x);  // xx = x^2
+
+    _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);  // Horner evaluation of the numerator P(x)
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
+    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
+    _vlog_y = v_mul(_vlog_y, _vlog_x);
+    _vlog_y = v_mul(_vlog_y, _vlog_xx);  // y = x^3 * P(x)
+
+    _vlog_z = v_add(_vlog_x, _vlog_q0_fp64);  // Horner evaluation of the denominator Q(x), starting from x + q0
+    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
+    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
+    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
+    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
+
+    _vlog_z = v_div(_vlog_y, _vlog_z);  // z = x^3 * P(x) / Q(x)
+    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));  // z -= e * C0 (small part of e * ln 2)
+    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));  // z -= x^2 / 2
+
+    _vlog_z = v_add(_vlog_z, _vlog_x);  // z += x
+    _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);  // z += e * C1 (large part of e * ln 2)
+
+    // log(0) -> -INF
+    _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
+    _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);  // -INF bit pattern
+    // log(NEG), log(NAN) -> NAN
+    _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());  // false for negative and NaN lanes
+    _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));  // NaN pattern written out
+    // log(INF) -> INF
+    _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));  // +INF bit pattern
+    _vlog_z = v_select(mask_inf, x, _vlog_z);
+    return _vlog_z;
+}
+//! @}
 
-//! @name Error Function
+//! @name Sine and Cosine
 //! @{
+template<typename _TpVec16F, typename _TpVec16S>  // _TpVec16F: f16 vector type; _TpVec16S: s16 vector type used for the integer/mask work
+inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {  // writes lane-wise sin(x) to ysin and cos(x) to ycos
+    const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
+    const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));  // DP1 + DP2 + DP3 = 0.785398163..., pi/4 split into three parts (stored negated)
+    const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
+    const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
+    const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));  // coefficients of the polynomial built in y2 below
+    const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
+    const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
+    const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));  // coefficients of the polynomial built in y1 below
+    const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
+    const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
+    const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));  // NaN pattern written for NaN/INF inputs
+    const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));  // only the sign bit set; XOR with it negates a value
+
+    _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
+    _TpVec16S emm2;
+
+    sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());  // lanes where x < 0
+    _vx = v_abs(x);
+    _vy = v_mul(_vx, v_cephes_FOPI);  // |x| * 4/pi
+
+    emm2 = v_trunc(_vy);  // integer part
+    emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
+    emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));  // (j + 1) & ~1: round up to an even integer
+    _vy = v_cvt_f16(emm2);
+
+    _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));  // set where bit 1 of emm2 is clear
+
+    _vx = v_fma(_vy, v_minus_DP1, _vx);  // _vx -= _vy * pi/4, subtracted in three steps
+    _vx = v_fma(_vy, v_minus_DP2, _vx);
+    _vx = v_fma(_vy, v_minus_DP3, _vx);
+
+    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));  // (x < 0) XOR (bit 2 of emm2 clear)
+    sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));  // bit 2 of (emm2 - 2) clear
+
+    _TpVec16F _vxx = v_mul(_vx, _vx);  // square of the reduced argument
+    _TpVec16F y1, y2;
+
+    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);  // y1 = 1 - xx/2 + xx^2 * (coscof polynomial in xx)
+    y1 = v_fma(y1, _vxx, v_coscof_p2);
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
+
+    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);  // y2 = x + x * xx * (sincof polynomial in xx)
+    y2 = v_fma(y2, _vxx, v_sincof_p2);
+    y2 = v_mul(y2, _vxx);
+    y2 = v_fma(y2, _vx, _vx);
+
+    ysin = v_select(poly_mask, y2, y1);  // poly_mask picks y2 for sin and y1 for cos; the roles swap where it is clear
+    ycos = v_select(poly_mask, y1, y2);
+    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));  // negate where sign_mask_sin is clear
+    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);  // negate where sign_mask_cos is set
+
+    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
+    _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));  // NOTE(review): _vx is the range-reduced value here, not |x|; relies on it still being +INF for infinite x - confirm
+    _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));  // x != x is true only for NaN lanes
+    ysin = v_select(mask_nan, v_nan, ysin);
+    ycos = v_select(mask_nan, v_nan, ycos);
+}
 
-    inline v_float32 v_erf(const v_float32 &v) {
-        const v_float32 coef0 = vx_setall_f32(0.3275911f),
-                        coef1 = vx_setall_f32(1.061405429f),
-                        coef2 = vx_setall_f32(-1.453152027f),
-                        coef3 = vx_setall_f32(1.421413741f),
-                        coef4 = vx_setall_f32(-0.284496736f),
-                        coef5 = vx_setall_f32(0.254829592f),
-                        ones = vx_setall_f32(1.0f),
-                        neg_zeros = vx_setall_f32(-0.f);
-        v_float32 t = v_abs(v);
-        // sign(v)
-        v_float32 sign_mask = v_and(neg_zeros, v);
-
-        t = v_div(ones, v_fma(coef0, t, ones));
-        v_float32 r = v_fma(coef1, t, coef2);
-        r = v_fma(r, t, coef3);
-        r = v_fma(r, t, coef4);
-        r = v_fma(r, t, coef5);
-        // - v * v
-        v_float32 pow_2 = v_mul(v, v);
-        v_float32 neg_pow_2 = v_xor(neg_zeros, pow_2);
-        // - exp(- v * v)
-        v_float32 exp = v_exp(neg_pow_2);
-        v_float32 neg_exp = v_xor(neg_zeros, exp);
-        v_float32 res = v_mul(t, neg_exp);
-        res = v_fma(r, res, ones);
-        return v_xor(sign_mask, res);
-    }
-
-#define OPENCV_HAL_MATH_HAVE_ERF 1
+template<typename _TpVec16F, typename _TpVec16S>
+inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {  // lane-wise sin(x) for f16 vectors
+    _TpVec16F sine_out, cosine_unused;  // the shared kernel always produces both results
+    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, sine_out, cosine_unused);
+    return sine_out;
+}
+
+template<typename _TpVec16F, typename _TpVec16S>
+inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {  // lane-wise cos(x) for f16 vectors
+    _TpVec16F sine_unused, cosine_out;  // the shared kernel always produces both results
+    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, sine_unused, cosine_out);
+    return cosine_out;
+}
+
+
+template<typename _TpVec32F, typename _TpVec32S>  // _TpVec32F: f32 vector type; _TpVec32S: s32 vector type used for the integer/mask work
+inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {  // writes lane-wise sin(x) to ysin and cos(x) to ycos
+    const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
+    const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);  // DP1 + DP2 + DP3 = 0.785398163..., pi/4 split into three parts (stored negated)
+    const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
+    const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
+    const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);  // coefficients of the polynomial built in y2 below
+    const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
+    const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
+    const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);  // coefficients of the polynomial built in y1 below
+    const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
+    const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
+    const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));  // NaN pattern written for NaN/INF inputs
+    const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);  // only the sign bit set; XOR with it negates a value
+
+    _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
+    _TpVec32S emm2;
+
+    sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());  // lanes where x < 0
+    _vx = v_abs(x);
+    _vy = v_mul(_vx, v_cephes_FOPI);  // |x| * 4/pi
+
+    emm2 = v_trunc(_vy);  // integer part
+    emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
+    emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));  // (j + 1) & ~1: round up to an even integer
+    _vy = v_cvt_f32(emm2);
+
+    _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));  // set where bit 1 of emm2 is clear
+
+    _vx = v_fma(_vy, v_minus_DP1, _vx);  // _vx -= _vy * pi/4, subtracted in three steps
+    _vx = v_fma(_vy, v_minus_DP2, _vx);
+    _vx = v_fma(_vy, v_minus_DP3, _vx);
+
+    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));  // (x < 0) XOR (bit 2 of emm2 clear)
+    sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));  // bit 2 of (emm2 - 2) clear
+
+    _TpVec32F _vxx = v_mul(_vx, _vx);  // square of the reduced argument
+    _TpVec32F y1, y2;
+
+    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);  // y1 = 1 - xx/2 + xx^2 * (coscof polynomial in xx)
+    y1 = v_fma(y1, _vxx, v_coscof_p2);
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
+
+    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);  // y2 = x + x * xx * (sincof polynomial in xx)
+    y2 = v_fma(y2, _vxx, v_sincof_p2);
+    y2 = v_mul(y2, _vxx);
+    y2 = v_fma(y2, _vx, _vx);
+
+    ysin = v_select(poly_mask, y2, y1);  // poly_mask picks y2 for sin and y1 for cos; the roles swap where it is clear
+    ycos = v_select(poly_mask, y1, y2);
+    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));  // negate where sign_mask_sin is clear
+    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);  // negate where sign_mask_cos is set
+
+    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
+    _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));  // NOTE(review): _vx is the range-reduced value here, not |x|; relies on it still being +INF for infinite x - confirm
+    _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));  // x != x is true only for NaN lanes
+    ysin = v_select(mask_nan, v_nan, ysin);
+    ycos = v_select(mask_nan, v_nan, ycos);
+}
+
+template<typename _TpVec32F, typename _TpVec32S>
+inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {  // lane-wise sin(x) for f32 vectors
+    _TpVec32F sine_out, cosine_unused;  // the shared kernel always produces both results
+    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, sine_out, cosine_unused);
+    return sine_out;
+}
+
+template<typename _TpVec32F, typename _TpVec32S>
+inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {  // lane-wise cos(x) for f32 vectors
+    _TpVec32F sine_unused, cosine_out;  // the shared kernel always produces both results
+    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, sine_unused, cosine_out);
+    return cosine_out;
+}
+
+template<typename _TpVec64F, typename _TpVec64S>  // _TpVec64F: f64 vector type; _TpVec64S: s64 vector type used for the integer/mask work
+inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {  // writes lane-wise sin(x) to ysin and cos(x) to ycos
+    const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
+    const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);  // DP1 + DP2 + DP3 = 0.78539816339744..., pi/4 split into three parts (stored negated)
+    const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
+    const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
+    const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);  // sin_C1..C6: coefficients of the polynomial built in y2 below
+    const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
+    const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
+    const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
+    const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
+    const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
+    const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);  // cos_C1..C6: coefficients of the polynomial built in y1 below
+    const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
+    const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
+    const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
+    const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
+    const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
+    const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));  // NaN pattern written for NaN/INF inputs
+    const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);  // only the sign bit set; XOR with it negates a value
+
+    _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
+    _TpVec64S emm2;
+
+    sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());  // lanes where x < 0
+    _vx = v_abs(x);
+    _vy = v_mul(_vx, v_cephes_FOPI);  // |x| * 4/pi
+
+    emm2 = v_expand_low(v_trunc(_vy));  // integer part, widened to _TpVec64S (presumably v_trunc yields narrower lanes - confirm)
+    emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
+    emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));  // (j + 1) & ~1: round up to an even integer
+    _vy = v_cvt_f64(emm2);
+
+    _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));  // set where bit 1 of emm2 is clear
+
+    _vx = v_fma(_vy, v_minus_DP1, _vx);  // _vx -= _vy * pi/4, subtracted in three steps
+    _vx = v_fma(_vy, v_minus_DP2, _vx);
+    _vx = v_fma(_vy, v_minus_DP3, _vx);
+
+    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));  // (x < 0) XOR (bit 2 of emm2 clear)
+    sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));  // bit 2 of (emm2 - 2) clear
+
+    _TpVec64F _vxx = v_mul(_vx, _vx);  // square of the reduced argument
+    _TpVec64F y1, y2;
+
+    y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);  // y1 = 1 - xx/2 + xx^2 * (cos_C polynomial in xx)
+    y1 = v_fma(y1, _vxx, v_cos_C3);
+    y1 = v_fma(y1, _vxx, v_cos_C4);
+    y1 = v_fma(y1, _vxx, v_cos_C5);
+    y1 = v_fma(y1, _vxx, v_cos_C6);
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
+    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
+
+    y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);  // y2 = x + x * xx * (sin_C polynomial in xx)
+    y2 = v_fma(y2, _vxx, v_sin_C3);
+    y2 = v_fma(y2, _vxx, v_sin_C4);
+    y2 = v_fma(y2, _vxx, v_sin_C5);
+    y2 = v_fma(y2, _vxx, v_sin_C6);
+    y2 = v_mul(y2, _vxx);
+    y2 = v_fma(y2, _vx, _vx);
+
+    ysin = v_select(poly_mask, y2, y1);  // poly_mask picks y2 for sin and y1 for cos; the roles swap where it is clear
+    ycos = v_select(poly_mask, y1, y2);
+    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));  // negate where sign_mask_sin is clear
+    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);  // negate where sign_mask_cos is set
+
+    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
+    _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));  // NOTE(review): _vx is the range-reduced value here, not |x|; relies on it still being +INF for infinite x - confirm
+    _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));  // x != x is true only for NaN lanes
+    ysin = v_select(mask_nan, v_nan, ysin);
+    ycos = v_select(mask_nan, v_nan, ycos);
+}
+
+template<typename _TpVec64F, typename _TpVec64S>
+inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {  // lane-wise sin(x) for f64 vectors
+    _TpVec64F sine_out, cosine_unused;  // the shared kernel always produces both results
+    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, sine_out, cosine_unused);
+    return sine_out;
+}
+
+template<typename _TpVec64F, typename _TpVec64S>
+inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {  // lane-wise cos(x) for f64 vectors
+    _TpVec64F sine_unused, cosine_out;  // the shared kernel always produces both results
+    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, sine_unused, cosine_out);
+    return cosine_out;
+}
 //! @}
 
-#endif // OPENCV_HAL_MATH_HAVE_ERF
 
+/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
+   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
+*/
 
+//! @name Error Function
+//! @{
+// Lane-wise erf(v) for f32 vectors: sign(v) * (1 - poly(t) * t * exp(-v*v)), with t = 1 / (1 + p*|v|).
+template<typename _TpVec32F, typename _TpVec32S>
+inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
+    const _TpVec32F one = v_setall_<_TpVec32F>(1.0f);
+    const _TpVec32F sign_bit = v_setall_<_TpVec32F>(-0.f);  // only the sign bit set
+    const _TpVec32F p = v_setall_<_TpVec32F>(0.3275911f);
+    const _TpVec32F a5 = v_setall_<_TpVec32F>(1.061405429f);
+    const _TpVec32F a4 = v_setall_<_TpVec32F>(-1.453152027f);
+    const _TpVec32F a3 = v_setall_<_TpVec32F>(1.421413741f);
+    const _TpVec32F a2 = v_setall_<_TpVec32F>(-0.284496736f);
+    const _TpVec32F a1 = v_setall_<_TpVec32F>(0.254829592f);
+    // Keep sign(v) aside; it is XOR-ed back onto the result at the end.
+    const _TpVec32F v_sign = v_and(sign_bit, v);
+
+    // t = 1 / (1 + p * |v|)
+    const _TpVec32F t = v_div(one, v_fma(p, v_abs(v), one));
+    // Horner form: poly = (((a5*t + a4)*t + a3)*t + a2)*t + a1
+    _TpVec32F poly = v_fma(a5, t, a4);
+    poly = v_fma(poly, t, a3);
+    poly = v_fma(poly, t, a2);
+    poly = v_fma(poly, t, a1);
+    // -(v * v), negated by flipping the sign bit
+    const _TpVec32F neg_sq = v_xor(sign_bit, v_mul(v, v));
+    // -exp(-v * v)
+    const _TpVec32F neg_gauss = v_xor(sign_bit, v_exp_default_32f<_TpVec32F, _TpVec32S>(neg_sq));
+    // 1 + poly * (t * -exp(-v * v))
+    const _TpVec32F magnitude = v_fma(poly, v_mul(t, neg_gauss), one);
+    return v_xor(v_sign, magnitude);
 }
-#endif  // OPENCV_HAL_INTRIN_HPP
+//! @}
+
+#endif // OPENCV_HAL_INTRIN_MATH_HPP
diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
index 8d2c22b08760..3917faa292cd 100644
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
@@ -235,6 +235,8 @@ struct v_float64x2
 #define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
+template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
+template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
@@ -1861,6 +1863,20 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"  // generic v_*_default_* templates; the overloads below instantiate them for the MSA vector types
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
+// Double-precision overloads (v_int64x2 is the paired integer type); no v_erf overload is declared for v_float64x2 here.
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 7685b435bf83..d42d48ee8296 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -414,6 +414,8 @@ struct v_float64x2
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, _TpCast, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_TpCast)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix((_TpCast)v)); } \
+template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
+template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
@@ -435,6 +437,7 @@ OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, uint64, u64)
 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, int64, s64)
 #if CV_SIMD128_FP16
 OPENCV_HAL_IMPL_NEON_INIT(float16x8, hfloat, __fp16,  f16);
+template <> inline v_float16x8 v_setall_(float v) { return v_setall_f16((hfloat)v); }  // float overload: converts to hfloat, so generic code can call v_setall_<v_float16x8>() with float literals
 #define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
 inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
 OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)
@@ -3030,6 +3033,28 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"  // generic v_*_default_* templates; the overloads below instantiate them for the NEON vector types
+#if CV_SIMD128_FP16  // half-precision overloads
+inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f<v_float16x8, v_int16x8>(x); }
+inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f<v_float16x8, v_int16x8>(x); }
+inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f<v_float16x8, v_int16x8>(x, s, c); }
+inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f<v_float16x8, v_int16x8>(x); }
+inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f<v_float16x8, v_int16x8>(x); }
+#endif  // CV_SIMD128_FP16
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }  // v_erf is only declared for f32 in this block
+#if CV_SIMD128_64F  // double-precision overloads
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+#endif  // CV_SIMD128_64F
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
index 4900418df3ce..146335dc017d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -355,10 +355,12 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64
 
 #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
 inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); }     \
-inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
+inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \
+template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); }          \
+template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); }
 
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
-OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
@@ -371,6 +373,11 @@ inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v,
 inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
 inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
 
+template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); }
+template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); }
+
+template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); }
+template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); }
 
 #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
 inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
@@ -2859,6 +2866,20 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
+
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index 13c616b046a9..76288166051d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -182,6 +182,14 @@ inline v_##_Tpvec v_setzero_##suffix1() \
 inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
 { \
     return __riscv_vmv_v_x_##suffix2##m1(v, vl); \
+} \
+template <> inline v_##_Tpvec v_setzero_() \
+{ \
+    return v_setzero_##suffix1(); \
+} \
+template <> inline v_##_Tpvec v_setall_(_Tp v) \
+{ \
+    return v_setall_##suffix1(v); \
 }
 
 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_int8>::vlanes())
@@ -201,6 +209,14 @@ inline v_##_Tpv v_setzero_##suffix() \
 inline v_##_Tpv v_setall_##suffix(_Tp v) \
 { \
     return __riscv_vfmv_v_f_##suffix##m1(v, vl); \
+} \
+template <> inline v_##_Tpv v_setzero_() \
+{ \
+    return v_setzero_##suffix(); \
+} \
+template <> inline v_##_Tpv v_setall_(_Tp v) \
+{ \
+    return v_setall_##suffix(v); \
 }
 
 #if CV_SIMD_SCALABLE_FP16
@@ -2471,6 +2487,20 @@ inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"
+inline v_float32 v_exp(const v_float32& x) { return v_exp_default_32f<v_float32, v_int32>(x); }
+inline v_float32 v_log(const v_float32& x) { return v_log_default_32f<v_float32, v_int32>(x); }
+inline void v_sincos(const v_float32& x, v_float32& s, v_float32& c) { v_sincos_default_32f<v_float32, v_int32>(x, s, c); }
+inline v_float32 v_sin(const v_float32& x) { return v_sin_default_32f<v_float32, v_int32>(x); }
+inline v_float32 v_cos(const v_float32& x) { return v_cos_default_32f<v_float32, v_int32>(x); }
+inline v_float32 v_erf(const v_float32& x) { return v_erf_default_32f<v_float32, v_int32>(x); }
+
+inline v_float64 v_exp(const v_float64& x) { return v_exp_default_64f<v_float64, v_int64>(x); }
+inline v_float64 v_log(const v_float64& x) { return v_log_default_64f<v_float64, v_int64>(x); }
+inline void v_sincos(const v_float64& x, v_float64& s, v_float64& c) { v_sincos_default_64f<v_float64, v_int64>(x, s, c); }
+inline v_float64 v_sin(const v_float64& x) { return v_sin_default_64f<v_float64, v_int64>(x); }
+inline v_float64 v_cos(const v_float64& x) { return v_cos_default_64f<v_float64, v_int64>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index ee4545db6bd6..26ea34026382 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -347,6 +347,8 @@ namespace hal_sse_internal
 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
+template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
+template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
 { return _Tpvec(cast(a.val)); }
 
@@ -364,6 +366,11 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
 
+template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); }
+template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); }
+template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); }
+template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); }
+
 template<typename _Tpvec> inline
 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
 template<typename _Tpvec> inline
@@ -3462,6 +3469,21 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
+
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index fbe690461a5e..2157e1e87063 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -261,6 +261,8 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
 #define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast)                        \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); }             \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));}          \
+template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); }               \
+template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); }           \
 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a)  \
 { return _Tpvec((cast)a.val); }
 
@@ -1594,6 +1596,19 @@ template<int i, typename Tvec>
 inline Tvec v_broadcast_element(const Tvec& v)
 { return Tvec(vec_splat(v.val, i)); }
 
+#include "intrin_math.hpp"
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
+
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index 3a8069ca911e..70198451c084 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -8,9 +8,18 @@
 #include <limits>
 #include <cstring>
 #include <algorithm>
-#include <emscripten/version.h>
 #include "opencv2/core/saturate.hpp"
 
+
+// Emscripten v2.0.13 (latest officially supported, as of 07/30/2024):
+// __EMSCRIPTEN_major__, __EMSCRIPTEN_minor__ and __EMSCRIPTEN_tiny__ are defined via commandline in
+// https://github.com/emscripten-core/emscripten/blob/1690a5802cd1241adc9714fb7fa2f633d38860dc/tools/shared.py#L506-L515
+//
+// See https://github.com/opencv/opencv/pull/25909
+#ifndef __EMSCRIPTEN_major__
+#include <emscripten/version.h>
+#endif
+
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it.
 #define CV_SIMD128_FP16 0
@@ -392,6 +401,8 @@ inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
 #define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
+template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
+template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
 { return _Tpvec(a.val); }
 
@@ -2767,6 +2778,20 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 
 inline void v_cleanup() {}
 
+#include "intrin_math.hpp"
+inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
+inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
+inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
+inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
+
+inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
+inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
+inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
+inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index c6a4d9c7286e..a9a90e998964 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -568,6 +568,22 @@ typedef OutputArray OutputArrayOfArrays;
 typedef const _InputOutputArray& InputOutputArray;
 typedef InputOutputArray InputOutputArrayOfArrays;
 
+/** @brief Returns an empty InputArray or OutputArray.
+
+ This function is used to provide an "empty" or "null" array when certain functions
+ take optional input or output arrays that you don't want to provide.
+
+ Many OpenCV functions accept optional arguments as `cv::InputArray` or `cv::OutputArray`.
+ When you don't want to pass any data for these optional parameters, you can use `cv::noArray()`
+ to indicate that you are omitting them.
+
+ @return An empty `cv::InputArray` or `cv::OutputArray` that can be used as a placeholder.
+
+ @note This is often used when a function has optional arrays, and you do not want to
+ provide a specific input or output array.
+
+ @see cv::InputArray, cv::OutputArray
+ */
 CV_EXPORTS InputOutputArray noArray();
 
 /////////////////////////////////// MatAllocator //////////////////////////////////////
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 221f2fee1df3..649ebd265726 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -155,10 +155,10 @@ inline Size _InputArray::getSz() const { return sz; }
 
 inline _InputArray::_InputArray() { init(0 + NONE, 0); }
 inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
-inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); }
-inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
-inline _InputArray::_InputArray(const UMat& m) { init(UMAT+ACCESS_READ, &m); }
-inline _InputArray::_InputArray(const std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_READ, &vec); }
+inline _InputArray::_InputArray(const Mat& m) { init(+MAT+ACCESS_READ, &m); }
+inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(+STD_VECTOR_MAT+ACCESS_READ, &vec); }
+inline _InputArray::_InputArray(const UMat& m) { init(+UMAT+ACCESS_READ, &m); }
+inline _InputArray::_InputArray(const std::vector<UMat>& vec) { init(+STD_VECTOR_UMAT+ACCESS_READ, &vec); }
 
 template<typename _Tp> inline
 _InputArray::_InputArray(const std::vector<_Tp>& vec)
@@ -170,7 +170,7 @@ _InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
 
 template<std::size_t _Nm> inline
 _InputArray::_InputArray(const std::array<Mat, _Nm>& arr)
-{ init(STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); }
+{ init(+STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); }
 
 inline
 _InputArray::_InputArray(const std::vector<bool>& vec)
@@ -200,16 +200,16 @@ inline _InputArray::_InputArray(const double& val)
 { init(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F + ACCESS_READ, &val, Size(1,1)); }
 
 inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
-{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
+{ init(+CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
 
 inline _InputArray::_InputArray(const std::vector<cuda::GpuMat>& d_mat)
-{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);}
+{	init(+STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);}
 
 inline _InputArray::_InputArray(const ogl::Buffer& buf)
-{ init(OPENGL_BUFFER + ACCESS_READ, &buf); }
+{ init(+OPENGL_BUFFER + ACCESS_READ, &buf); }
 
 inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
-{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
+{ init(+CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
 
 template<typename _Tp> inline
 _InputArray _InputArray::rawIn(const std::vector<_Tp>& vec)
@@ -253,12 +253,12 @@ inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::
 
 ////////////////////////////////////////////////////////////////////////////////////////
 
-inline _OutputArray::_OutputArray() { init(NONE + ACCESS_WRITE, 0); }
+inline _OutputArray::_OutputArray() { init(+NONE + ACCESS_WRITE, 0); }
 inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags + ACCESS_WRITE, _obj); }
-inline _OutputArray::_OutputArray(Mat& m) { init(MAT+ACCESS_WRITE, &m); }
-inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
-inline _OutputArray::_OutputArray(UMat& m) { init(UMAT + ACCESS_WRITE, &m); }
-inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+inline _OutputArray::_OutputArray(Mat& m) { init(+MAT+ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(+STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
+inline _OutputArray::_OutputArray(UMat& m) { init(+UMAT + ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(+STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
 
 template<typename _Tp> inline
 _OutputArray::_OutputArray(std::vector<_Tp>& vec)
@@ -270,7 +270,7 @@ _OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
 
 template<std::size_t _Nm> inline
 _OutputArray::_OutputArray(std::array<Mat, _Nm>& arr)
-{ init(STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+{ init(+STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
 
 template<typename _Tp> inline
 _OutputArray::_OutputArray(std::vector<std::vector<_Tp> >& vec)
@@ -325,16 +325,16 @@ _OutputArray::_OutputArray(const _Tp* vec, int n)
 { init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); }
 
 inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat)
-{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
+{ init(+CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
 
 inline _OutputArray::_OutputArray(std::vector<cuda::GpuMat>& d_mat)
-{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);}
+{	init(+STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);}
 
 inline _OutputArray::_OutputArray(ogl::Buffer& buf)
-{ init(OPENGL_BUFFER + ACCESS_WRITE, &buf); }
+{ init(+OPENGL_BUFFER + ACCESS_WRITE, &buf); }
 
 inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem)
-{ init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
+{ init(+CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
 
 inline _OutputArray::_OutputArray(const Mat& m)
 { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); }
@@ -403,10 +403,10 @@ std::vector<std::vector<_Tp> >& _OutputArray::getVecVecRef() const
 
 inline _InputOutputArray::_InputOutputArray() { init(0+ACCESS_RW, 0); }
 inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags+ACCESS_RW, _obj); }
-inline _InputOutputArray::_InputOutputArray(Mat& m) { init(MAT+ACCESS_RW, &m); }
-inline _InputOutputArray::_InputOutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_RW, &vec); }
-inline _InputOutputArray::_InputOutputArray(UMat& m) { init(UMAT+ACCESS_RW, &m); }
-inline _InputOutputArray::_InputOutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_RW, &vec); }
+inline _InputOutputArray::_InputOutputArray(Mat& m) { init(+MAT+ACCESS_RW, &m); }
+inline _InputOutputArray::_InputOutputArray(std::vector<Mat>& vec) { init(+STD_VECTOR_MAT+ACCESS_RW, &vec); }
+inline _InputOutputArray::_InputOutputArray(UMat& m) { init(+UMAT+ACCESS_RW, &m); }
+inline _InputOutputArray::_InputOutputArray(std::vector<UMat>& vec) { init(+STD_VECTOR_UMAT+ACCESS_RW, &vec); }
 
 template<typename _Tp> inline
 _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec)
@@ -418,7 +418,7 @@ _InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
 
 template<std::size_t _Nm> inline
 _InputOutputArray::_InputOutputArray(std::array<Mat, _Nm>& arr)
-{ init(STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+{ init(+STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
 
 template<typename _Tp> inline
 _InputOutputArray::_InputOutputArray(std::vector<std::vector<_Tp> >& vec)
@@ -473,13 +473,13 @@ _InputOutputArray::_InputOutputArray(const _Tp* vec, int n)
 { init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); }
 
 inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat)
-{ init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+{ init(+CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
 
 inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf)
-{ init(OPENGL_BUFFER + ACCESS_RW, &buf); }
+{ init(+OPENGL_BUFFER + ACCESS_RW, &buf); }
 
 inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem)
-{ init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
+{ init(+CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
 
 inline _InputOutputArray::_InputOutputArray(const Mat& m)
 { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); }
@@ -622,7 +622,7 @@ Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
 
 template<typename _Tp, int m, int n> inline
 Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
-    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
+    : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
       datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
 {
     if( !copyData )
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index d345e24dc55d..1001d0460d2d 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -524,7 +524,7 @@ The generic function partition implements an \f$O(N^2)\f$ algorithm for splittin
 into one or more equivalency classes, as described in
 <http://en.wikipedia.org/wiki/Disjoint-set_data_structure> . The function returns the number of
 equivalency classes.
-@param _vec Set of elements stored as a vector.
+@param vec Set of elements stored as a vector.
 @param labels Output vector of labels. It contains as many elements as vec. Each label labels[i] is
 a 0-based cluster index of `vec[i]`.
 @param predicate Equivalence predicate (pointer to a boolean function of two arguments or an
@@ -534,11 +534,11 @@ may or may not be in the same class.
 @ingroup core_cluster
 */
 template<typename _Tp, class _EqPredicate> int
-partition( const std::vector<_Tp>& _vec, std::vector<int>& labels,
+partition( const std::vector<_Tp>& vec, std::vector<int>& labels,
           _EqPredicate predicate=_EqPredicate())
 {
-    int i, j, N = (int)_vec.size();
-    const _Tp* vec = &_vec[0];
+    int i, j, N = (int)vec.size();
+    const _Tp* _vec = &vec[0];
 
     const int PARENT=0;
     const int RANK=1;
@@ -564,7 +564,7 @@ partition( const std::vector<_Tp>& _vec, std::vector<int>& labels,
 
         for( j = 0; j < N; j++ )
         {
-            if( i == j || !predicate(vec[i], vec[j]))
+            if( i == j || !predicate(_vec[i], _vec[j]))
                 continue;
             int root2 = j;
 
diff --git a/modules/core/include/opencv2/core/persistence.hpp b/modules/core/include/opencv2/core/persistence.hpp
index 9c4f33fb1457..33f24d62e504 100644
--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@@ -53,50 +53,6 @@
 #  error persistence.hpp header must be compiled as C++
 #endif
 
-//! @addtogroup core_c
-//! @{
-
-/** @brief "black box" representation of the file storage associated with a file on disk.
-
-Several functions that are described below take CvFileStorage\* as inputs and allow the user to
-save or to load hierarchical collections that consist of scalar values, standard CXCore objects
-(such as matrices, sequences, graphs), and user-defined objects.
-
-OpenCV can read and write data in XML (<http://www.w3c.org/XML>), YAML (<http://www.yaml.org>) or
-JSON (<http://www.json.org/>) formats. Below is an example of 3x3 floating-point identity matrix A,
-stored in XML and YAML files
-using CXCore functions:
-XML:
-@code{.xml}
-    <?xml version="1.0">
-    <opencv_storage>
-    <A type_id="opencv-matrix">
-      <rows>3</rows>
-      <cols>3</cols>
-      <dt>f</dt>
-      <data>1. 0. 0. 0. 1. 0. 0. 0. 1.</data>
-    </A>
-    </opencv_storage>
-@endcode
-YAML:
-@code{.yaml}
-    %YAML:1.0
-    A: !!opencv-matrix
-      rows: 3
-      cols: 3
-      dt: f
-      data: [ 1., 0., 0., 0., 1., 0., 0., 0., 1.]
-@endcode
-As it can be seen from the examples, XML uses nested tags to represent hierarchy, while YAML uses
-indentation for that purpose (similar to the Python programming language).
-
-The same functions can read and write data in both formats; the particular format is determined by
-the extension of the opened file, ".xml" for XML files, ".yml" or ".yaml" for YAML and ".json" for
-JSON.
- */
-
-//! @} core_c
-
 #include "opencv2/core/types.hpp"
 #include "opencv2/core/mat.hpp"
 
@@ -283,13 +239,14 @@ element is a structure of 2 integers, followed by a single-precision floating-po
 equivalent notations of the above specification are `iif`, `2i1f` and so forth. Other examples: `u`
 means that the array consists of bytes, and `2d` means the array consists of pairs of doubles.
 
-@see @ref samples/cpp/filestorage.cpp
+@see @ref samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp
 */
 
 //! @{
 
-/** @example samples/cpp/filestorage.cpp
+/** @example samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp
 A complete example using the FileStorage interface
+Check @ref tutorial_file_input_output_with_xml_yml "the corresponding tutorial" for more details
 */
 
 ////////////////////////// XML & YAML I/O //////////////////////////
@@ -322,10 +279,10 @@ class CV_EXPORTS_W FileStorage
     };
     enum State
     {
-        UNDEFINED      = 0,
-        VALUE_EXPECTED = 1,
-        NAME_EXPECTED  = 2,
-        INSIDE_MAP     = 4
+        UNDEFINED      = 0,  //!< Initial or uninitialized state.
+        VALUE_EXPECTED = 1,  //!< Expecting a value in the current position.
+        NAME_EXPECTED  = 2,  //!< Expecting a key/name in the current position.
+        INSIDE_MAP     = 4   //!< Indicates being inside a map (a set of key-value pairs).
     };
 
     /** @brief The constructors.
diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index 36edd8ab31f9..39f2ddcdeb3d 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -134,6 +134,36 @@ namespace cv { namespace cuda
     template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
     template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
 
+#define nppSafeCall(expr)  cv::cuda::checkNppError(expr, __FILE__, __LINE__, CV_Func)
+// NppStreamContext was introduced in NPP version 10100 (shipped with CUDA toolkit 10.1, CUDA_VERSION == 10010); however, not all of the NPP functions called internally by OpenCV
+// - have an NppStreamContext argument (e.g. nppiHistogramEvenGetBufferSize_8u_C1R_Ctx in CUDA 12.3) and/or
+// - have a corresponding function in the supplied library (e.g. nppiEvenLevelsHost_32s_Ctx is not present in nppist.lib or libnppist.so as of CUDA 12.6).
+// Because support for these functions was introduced gradually and without being mentioned in the release notes, this flag is set to a version of NPP (version 12205, included in CUDA toolkit 12.4) which is known to work.
+#define USE_NPP_STREAM_CTX (NPP_VERSION >= 12205) // parenthesized so the flag can be negated or combined safely inside #if expressions
+#if USE_NPP_STREAM_CTX
+    class NppStreamHandler
+    {
+    public:
+        //! Fetches the current NPP stream context, then rebinds it to newStream and reads that stream's flags into it.
+        inline explicit NppStreamHandler(cudaStream_t newStream)
+        {
+            nppStreamContext = {};
+            nppSafeCall(nppGetStreamContext(&nppStreamContext));
+            nppStreamContext.hStream = newStream;
+            cudaSafeCall(cudaStreamGetFlags(nppStreamContext.hStream, &nppStreamContext.nStreamFlags));
+        }
+
+        //! Overload taking a Stream: delegates to the cudaStream_t constructor via StreamAccessor::getStream().
+        inline explicit NppStreamHandler(Stream& newStream) : NppStreamHandler(StreamAccessor::getStream(newStream)) {}
+
+        //! Both accessors return a copy of the stored context and leave the handler unchanged.
+        inline operator NppStreamContext() const { return nppStreamContext; }
+        inline NppStreamContext get() const { return nppStreamContext; }
+
+    private:
+        NppStreamContext nppStreamContext;
+    };
+#else
     class NppStreamHandler
     {
     public:
@@ -157,9 +187,9 @@ namespace cv { namespace cuda
     private:
         cudaStream_t oldStream;
     };
+#endif
 }}
 
-#define nppSafeCall(expr)  cv::cuda::checkNppError(expr, __FILE__, __LINE__, CV_Func)
 #define cuSafeCall(expr)  cv::cuda::checkCudaDriverApiError(expr, __FILE__, __LINE__, CV_Func)
 
 #endif // HAVE_CUDA
diff --git a/modules/core/include/opencv2/core/quaternion.hpp b/modules/core/include/opencv2/core/quaternion.hpp
index 9e3e44332f60..e39065020c5f 100644
--- a/modules/core/include/opencv2/core/quaternion.hpp
+++ b/modules/core/include/opencv2/core/quaternion.hpp
@@ -31,7 +31,7 @@
 #include <iostream>
 namespace cv
 {
-//! @addtogroup core
+//! @addtogroup core_quaternion
 //! @{
 
 //! Unit quaternion flag
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index 5eed688c830d..cfe98401dbd5 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -176,7 +176,38 @@ extern "C" typedef int (*ErrorCallback)( int status, const char* func_name,
 */
 CV_EXPORTS ErrorCallback redirectError( ErrorCallback errCallback, void* userdata=0, void** prevUserdata=0);
 
+/** @brief Generates a unique temporary file name.
+
+This function generates a full, unique file path for a temporary file,
+which can be used to create temporary files for various purposes.
+
+@param suffix (optional) The desired file extension or suffix for the temporary file (e.g., ".png", ".txt").
+If no suffix is provided (suffix = 0), the file will not have a specific extension.
+
+@return cv::String A full unique path for the temporary file.
+
+@note
+- The function does not create the file, it only generates the name.
+- The file name is unique for the system session.
+- Works cross-platform (Windows, Linux, macOS).
+ */
 CV_EXPORTS String tempfile( const char* suffix = 0);
+
+/** @brief Searches for files matching the specified pattern in a directory.
+
+This function searches for files that match a given pattern (e.g., `*.jpg`)
+in the specified directory. The search can be limited to the directory itself
+or be recursive, including subdirectories.
+
+@param pattern The file search pattern, which can include wildcards like `*`
+(for matching multiple characters) or `?` (for matching a single character).
+
+@param result  Output vector where the file paths matching the search
+pattern will be stored.
+@param recursive (optional) Boolean flag indicating whether to search
+subdirectories recursively. If true, the search will include all subdirectories.
+The default value is `false`.
+ */
 CV_EXPORTS void glob(String pattern, std::vector<String>& result, bool recursive = false);
 
 /** @brief OpenCV will try to set the number of threads for subsequent parallel regions.
@@ -309,11 +340,12 @@ class CV_EXPORTS_W TickMeter
     //! stops counting ticks.
     CV_WRAP void stop()
     {
-        int64 time = cv::getTickCount();
+        const int64 time = cv::getTickCount(); // sampled first, before the early-return check below
         if (startTime == 0)
             return;
         ++counter;
-        sumTime += (time - startTime);
+        lastTime = time - startTime; // length of the interval that just ended; read back through getLastTime*()
+        sumTime += lastTime;         // accumulated over every completed interval until reset()
         startTime = 0;
     }
 
@@ -336,11 +368,35 @@ class CV_EXPORTS_W TickMeter
     }
 
     //! returns passed time in seconds.
-    CV_WRAP double getTimeSec()   const
+    CV_WRAP double getTimeSec() const
     {
         return (double)getTimeTicks() / getTickFrequency();
     }
 
+    //! returns the ticks counted by the most recent stop() that closed a running interval (0 after reset()).
+    CV_WRAP int64 getLastTimeTicks() const
+    {
+        return lastTime;
+    }
+
+    //! returns passed time of the last iteration in microseconds (getLastTimeMilli() scaled by 1e3).
+    CV_WRAP double getLastTimeMicro() const
+    {
+        return getLastTimeMilli()*1e3;
+    }
+
+    //! returns passed time of the last iteration in milliseconds (getLastTimeSec() scaled by 1e3).
+    CV_WRAP double getLastTimeMilli() const
+    {
+        return getLastTimeSec()*1e3;
+    }
+
+    //! returns passed time of the last iteration in seconds (last tick count divided by getTickFrequency()).
+    CV_WRAP double getLastTimeSec() const
+    {
+        return (double)getLastTimeTicks() / getTickFrequency();
+    }
+
     //! returns internal counter value.
     CV_WRAP int64 getCounter() const
     {
@@ -373,15 +429,17 @@ class CV_EXPORTS_W TickMeter
     //! resets internal values.
     CV_WRAP void reset()
     {
-        startTime = 0;
-        sumTime = 0;
         counter = 0;
+        sumTime = 0;   // fields are now cleared in the order they are declared below
+        startTime = 0; // 0 is the value stop() tests for before counting an interval
+        lastTime = 0;  // getLastTime*() report 0 until the next completed interval
     }
 
 private:
     int64 counter;
     int64 sumTime;
     int64 startTime;
+    int64 lastTime;
 };
 
 /** @brief output operator
diff --git a/modules/core/src/opengl.cpp b/modules/core/src/opengl.cpp
index 45aa121a4aaf..83be34f1477c 100644
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@@ -42,6 +42,12 @@
 
 #include "precomp.hpp"
 
+#if defined (__APPLE__) || defined(MACOSX)
+   #define GL_SHARING_EXTENSION "cl_APPLE_gl_sharing"
+#else
+   #define GL_SHARING_EXTENSION "cl_khr_gl_sharing"
+#endif
+
 #ifdef HAVE_OPENGL
 #  include "gl_core_3_1.hpp"
 #  ifdef HAVE_CUDA
@@ -1635,94 +1641,148 @@ Context& initializeContextFromGL()
 #elif !defined(HAVE_OPENCL_OPENGL_SHARING)
     NO_OPENCL_SHARING_ERROR;
 #else
-    cl_uint numPlatforms;
-    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    cl_uint platformsCnt = 0;
+    cl_uint devCnt = 0;
+    cl_device_id* devices = nullptr;
+    cl_uint devUsed = 0;
+    cl_context context = nullptr;
+
+    cl_int status = clGetPlatformIDs(0, NULL, &platformsCnt);
     if (status != CL_SUCCESS)
         CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't get number of platforms: %d", status));
-    if (numPlatforms == 0)
+    if (platformsCnt == 0)
         CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available platforms");
 
-    std::vector<cl_platform_id> platforms(numPlatforms);
-    status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+    std::vector<cl_platform_id> platforms(platformsCnt);
+    status = clGetPlatformIDs(platformsCnt, &platforms[0], NULL);
     if (status != CL_SUCCESS)
-        CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't get number of platforms: %d", status));
+        CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't get platforms: %d", status));
+
 
     // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
+    bool sharingSupported = false;
 
-    int found = -1;
-    cl_device_id device = NULL;
-    cl_context context = NULL;
+    for (unsigned int i = 0; (!sharingSupported && (i < platformsCnt)); ++i) {
+        status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &devCnt);
+        if (status != CL_SUCCESS)
+            CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: No devices available: %d", status));
 
-    for (int i = 0; i < (int)numPlatforms; i++)
-    {
-        // query platform extension: presence of "cl_khr_gl_sharing" extension is required
-        {
-            AutoBuffer<char> extensionStr;
+        try {
+            devices = new cl_device_id[devCnt];
 
-            size_t extensionSize;
-            status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &extensionSize);
-            if (status == CL_SUCCESS)
-            {
-                extensionStr.allocate(extensionSize+1);
-                status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, extensionSize, (char*)extensionStr.data(), NULL);
-            }
+            status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, devCnt, devices, NULL);
             if (status != CL_SUCCESS)
-                CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't get platform extension string: %d", status));
-
-            if (!strstr((const char*)extensionStr.data(), "cl_khr_gl_sharing"))
-                continue;
-        }
-
-        clGetGLContextInfoKHR_fn clGetGLContextInfoKHR = (clGetGLContextInfoKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetGLContextInfoKHR");
-        if (!clGetGLContextInfoKHR)
+                CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't get platform devices: %d", status));
+
+            for (unsigned int j = 0; (!sharingSupported && (j < devCnt)); ++j) {
+                size_t extensionSize;
+                status = clGetDeviceInfo(devices[j], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize );
+                if (status != CL_SUCCESS)
+                    CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: No devices available: %d", status));
+
+                if(extensionSize > 0)
+                {
+                    char* extensions = nullptr;
+
+                    try {
+                        extensions = new char[extensionSize];
+
+                        status = clGetDeviceInfo(devices[j], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize);
+                        if (status != CL_SUCCESS)
+                            continue;
+                    } catch(...) {
+                        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Exception thrown during device extensions gathering");
+                    }
+
+                    std::string devString;
+
+                    if(extensions != nullptr) {
+                        devString = extensions;
+                        delete[] extensions;
+                    }
+                    else {
+                        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Unexpected error during device extensions gathering");
+                    }
+
+                    size_t oldPos = 0;
+                    size_t spacePos = devString.find(' ', oldPos); // extensions string is space delimited
+                    while (spacePos != devString.npos) {
+                        if (strcmp(GL_SHARING_EXTENSION,
+                                devString.substr(oldPos, spacePos - oldPos).c_str())
+                                == 0) {
+                            // Device supports context sharing with OpenGL
+                            devUsed = i;
+                            sharingSupported = true;
+                            break;
+                        }
+                        do {
+                            oldPos = spacePos + 1;
+                            spacePos = devString.find(' ', oldPos);
+                        } while (spacePos == oldPos);
+                    }
+                }
+            }
+        } catch(...) {
+            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Exception thrown during device information gathering");
+            if(devices != nullptr) {
+                delete[] devices;
+            }
             continue;
+        }
 
-        cl_context_properties properties[] =
-        {
-#if defined(_WIN32)
-            CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-            CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
-            CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
+        if (sharingSupported) {
+            // Define OS-specific context properties and create the OpenCL context
+#if defined (__APPLE__)
+                CGLContextObj cglContext = CGLGetCurrentContext();
+                CGLShareGroupObj cglShareGroup = CGLGetShareGroup(cglContext);
+                cl_context_properties props[] =
+                {
+                    CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)cglShareGroup,
+                    0
+                };
+                context = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum);
 #elif defined(__ANDROID__)
-            CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-            CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
-            CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(),
+                cl_context_properties props[] =
+                {
+                    CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
+                    CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
+                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                    0
+                };
+                context = clCreateContext(props, 1, &devices[devUsed], NULL, NULL, &status);
+#elif defined(_WIN32)
+                cl_context_properties props[] =
+                {
+                    CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
+                    CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
+                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                    0
+                };
+                context = clCreateContext(props, 1, &devices[devUsed], NULL, NULL, &status);
 #elif defined(__linux__)
-            CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-            CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
-            CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
+                cl_context_properties props[] =
+                {
+                    CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
+                    CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
+                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                    0
+                };
+                context = clCreateContext(props, 1, &devices[devUsed], NULL, NULL, &status);
 #endif
-            0
-        };
-
-        // query device
-        device = NULL;
-        status = clGetGLContextInfoKHR(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR, sizeof(cl_device_id), (void*)&device, NULL);
-        if (status != CL_SUCCESS)
-            continue;
+        }
 
-        // create context
-        context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
         if (status != CL_SUCCESS)
-        {
-            clReleaseDevice(device);
-        }
+            CV_Error_(cv::Error::OpenCLInitError, ("OpenCL: Can't create context for OpenGL interop: %d", status));
         else
-        {
-            found = i;
             break;
-        }
     }
 
-    if (found < 0)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for OpenGL interop");
 
-    cl_platform_id platform = platforms[found];
+    cl_platform_id platform = platforms[devUsed];
     std::string platformName = PlatformInfo(&platform).name();
 
-    OpenCLExecutionContext clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
-    clReleaseDevice(device);
+    OpenCLExecutionContext clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, devices[devUsed]);
+    clReleaseDevice(devices[devUsed]);
     clReleaseContext(context);
     clExecCtx.bind();
     return const_cast<Context&>(clExecCtx.getContext());
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 92b9dff2b1da..d42652a33575 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -307,14 +307,20 @@ template<typename R> struct TheTest
 #else
 #error "Configuration error"
 #endif
+        R setall_res3 = v_setall_<R>((LaneType)7);
+        R setall_resz = v_setzero_<R>();
 #if CV_SIMD_WIDTH > 0
         Data<R> setall_res1_; v_store(setall_res1_.d, setall_res1);
         Data<R> setall_res2_; v_store(setall_res2_.d, setall_res2);
+        Data<R> setall_res3_; v_store(setall_res3_.d, setall_res3);
+        Data<R> setall_resz_; v_store(setall_resz_.d, setall_resz);
         for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)5, setall_res1_[i]);
             EXPECT_EQ((LaneType)6, setall_res2_[i]);
+            EXPECT_EQ((LaneType)7, setall_res3_[i]);
+            EXPECT_EQ((LaneType)0, setall_resz_[i]);
         }
 #endif
 
@@ -2084,6 +2090,100 @@ template<typename R> struct TheTest
 
         return *this;
     }
+
+    void __test_sincos(LaneType diff_thr, LaneType flt_min) { // checks v_sincos against std::sin/std::cos; diff_thr scales the allowed error, flt_min adds a small floor to it
+        int n = VTraits<R>::vlanes();
+        // Sweep [0, PI] in steps of 0.01 and test each value together with its negation, i.e. one period from -PI to PI
+        const LaneType step = (LaneType) 0.01;
+        for (LaneType i = (LaneType)0; i <= (LaneType)M_PI;) { // no increment here: i advances inside the lane-filling loop
+            Data<R> dataPosPI, dataNegPI;
+            for (int j = 0; j < n; ++j) {
+                dataPosPI[j] = i;
+                dataNegPI[j] = LaneType(-1*i);
+                i = LaneType(i + step);
+            }
+            R posPI = dataPosPI, negPI = dataNegPI, sinPos, cosPos, sinNeg, cosNeg;
+            v_sincos(posPI, sinPos, cosPos);
+            v_sincos(negPI, sinNeg, cosNeg);
+            Data<R> resSinPos = sinPos, resCosPos = cosPos, resSinNeg = sinNeg, resCosNeg = cosNeg;
+            for (int j = 0; j < n; ++j) {
+                LaneType std_sin_pos = (LaneType) std::sin(dataPosPI[j]);
+                LaneType std_cos_pos = (LaneType) std::cos(dataPosPI[j]);
+                LaneType std_sin_neg = (LaneType) std::sin(dataNegPI[j]);
+                LaneType std_cos_neg = (LaneType) std::cos(dataNegPI[j]);
+                SCOPED_TRACE(cv::format("Period test value: %lf and %lf", (double) dataPosPI[j], (double) dataNegPI[j]));
+                EXPECT_LT(std::abs(resSinPos[j] - std_sin_pos), diff_thr * (std::abs(std_sin_pos) + flt_min * 100)); // bound is relative to |reference| plus a flt_min*100 floor, so it never collapses to 0
+                EXPECT_LT(std::abs(resCosPos[j] - std_cos_pos), diff_thr * (std::abs(std_cos_pos) + flt_min * 100));
+                EXPECT_LT(std::abs(resSinNeg[j] - std_sin_neg), diff_thr * (std::abs(std_sin_neg) + flt_min * 100));
+                EXPECT_LT(std::abs(resCosNeg[j] - std_cos_neg), diff_thr * (std::abs(std_cos_neg) + flt_min * 100));
+            }
+        }
+
+        // Randomized test: values from rng.uniform(-1000, 1000) mixed with special values (0, PI, PI/2, +/-INF, NaN)
+        std::vector<LaneType> specialValues = {(LaneType) 0, (LaneType) M_PI, (LaneType) (M_PI / 2), (LaneType) INFINITY, (LaneType) -INFINITY, (LaneType) NAN};
+        const int testRandNum = 10000;
+        const double specialValueProbability = 0.1; // 10% chance to insert a special value
+        cv::RNG_MT19937 rng; // default-constructed; presumably yields the same sequence on every run - TODO confirm
+
+        for (int i = 0; i < testRandNum; i++) {
+            Data<R> dataRand;
+            for (int j = 0; j < n; ++j) {
+                if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
+                    // Insert a special value
+                    int specialValueIndex = rng.uniform(0, (int) specialValues.size());
+                    dataRand[j] = specialValues[specialValueIndex];
+                } else {
+                    // Generate uniform random data in [-1000, 1000]
+                    dataRand[j] = (LaneType) rng.uniform(-1000, 1000);
+                }
+            }
+
+            // Compare with std::sin and std::cos
+            R x = dataRand, s, c;
+            v_sincos(x, s, c);
+            Data<R> resSin = s, resCos = c;
+            for (int j = 0; j < n; ++j) {
+                SCOPED_TRACE(cv::format("Random test value: %lf", (double) dataRand[j]));
+                LaneType std_sin = (LaneType) std::sin(dataRand[j]);
+                LaneType std_cos = (LaneType) std::cos(dataRand[j]);
+                // input NaN, +INF, -INF -> output NaN
+                if (std::isnan(dataRand[j]) || std::isinf(dataRand[j])) {
+                    EXPECT_TRUE(std::isnan(resSin[j]));
+                    EXPECT_TRUE(std::isnan(resCos[j]));
+                } else if(dataRand[j] == 0) {
+                    // sin(0) -> 0, cos(0) -> 1
+                    EXPECT_EQ(resSin[j], 0);
+                    EXPECT_EQ(resCos[j], 1);
+                } else {
+                    EXPECT_LT(std::abs(resSin[j] - std_sin), diff_thr * (std::abs(std_sin) + flt_min * 100)); // same tolerance formula as in the period sweep
+                    EXPECT_LT(std::abs(resCos[j] - std_cos), diff_thr * (std::abs(std_cos) + flt_min * 100));
+                }
+            }
+        }
+    }
+
+    // Currently a no-op: the body is compiled out with "#if 0" (original guard: CV_SIMD_FP16). BUG: https://github.com/opencv/opencv/issues/26362
+    TheTest &test_sincos_fp16() {
+#if 0 // CV_SIMD_FP16
+        hfloat flt16_min;
+        uint16_t flt16_min_hex = 0x0400; // bit pattern of the smallest positive normal half-precision value (2^-14)
+        std::memcpy(&flt16_min, &flt16_min_hex, sizeof(hfloat)); // copy the raw bits into the hfloat
+        __test_sincos((hfloat) 1e-3, flt16_min);
+#endif
+        return *this;
+    }
+
+    TheTest &test_sincos_fp32() {
+        __test_sincos(1e-6f, FLT_MIN); // diff_thr = 1e-6, flt_min = FLT_MIN
+        return *this;                  // returned so the test can be chained with the other .test_*() calls
+    }
+
+    TheTest &test_sincos_fp64() {
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+        __test_sincos(1e-11, DBL_MIN); // diff_thr = 1e-11, flt_min = DBL_MIN; skipped entirely when neither macro is set
+#endif
+        return *this;
+    }
 };
 
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@@ -2399,6 +2499,7 @@ void test_hal_intrin_float32()
         .test_pack_triplets()
         .test_exp_fp32()
         .test_log_fp32()
+        .test_sincos_fp32()
         .test_erf_fp32()
 #if CV_SIMD_WIDTH == 32
         .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
@@ -2433,6 +2534,7 @@ void test_hal_intrin_float64()
         .test_extract_highest()
         .test_exp_fp64()
         .test_log_fp64()
+        .test_sincos_fp64()
         //.test_broadcast_element<0>().test_broadcast_element<1>()
 #if CV_SIMD_WIDTH == 32
         .test_extract<2>().test_extract<3>()
@@ -2476,6 +2578,7 @@ void test_hal_intrin_float16()
         .test_extract_n<0>().test_extract_n<1>()
         .test_exp_fp16()
         .test_log_fp16()
+        .test_sincos_fp16()
 #else
     std::cout << "SKIP: CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16 is not available" << std::endl;
 #endif
diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake
index 871f99b419c1..8ddaadb51190 100644
--- a/modules/gapi/cmake/DownloadADE.cmake
+++ b/modules/gapi/cmake/DownloadADE.cmake
@@ -1,7 +1,7 @@
 set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade")
-set(ade_filename "v0.1.2d.zip")
-set(ade_subdir "ade-0.1.2d")
-set(ade_md5 "dbb095a8bf3008e91edbbf45d8d34885")
+set(ade_filename "v0.1.2e.zip")
+set(ade_subdir "ade-0.1.2e")
+set(ade_md5 "962ce79e0b95591f226431f7b5f152cd")
 ocv_download(FILENAME ${ade_filename}
              HASH ${ade_md5}
              URL
diff --git a/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
index 0b6dab6a9d41..f7bb2599242d 100644
--- a/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
@@ -54,6 +54,9 @@ class GAPI_EXPORTS_W_SIMPLE PyParams {
     GAPI_WRAP
     PyParams& cfgSessionOptions(const std::map<std::string, std::string>& options);
 
+    GAPI_WRAP
+    PyParams& cfgOptLevel(const int opt_level);
+
     GBackend backend() const;
     std::string tag() const;
     cv::util::any params() const;
diff --git a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
index fd0f69a768e4..eb6316b44681 100644
--- a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
@@ -15,6 +15,7 @@
 
 #include <opencv2/gapi/opencv_includes.hpp>
 #include <opencv2/gapi/util/any.hpp>
+#include <opencv2/gapi/util/optional.hpp>
 
 #include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
 #include <opencv2/gapi/gkernel.hpp> // GKernelPackage
@@ -354,6 +355,7 @@ struct ParamDesc {
     std::map<std::string, std::string> session_options;
     std::vector<cv::gapi::onnx::ep::EP> execution_providers;
     bool disable_mem_pattern;
+    cv::util::optional<int> opt_level;
 };
 } // namespace detail
 
@@ -648,6 +650,17 @@ template<typename Net> class Params {
         return *this;
     }
 
+    /** @brief Configures the graph optimization level for ONNX Runtime.
+
+    @param opt_level Optimization level. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).
+    Please see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgOptLevel(const int opt_level) {
+        desc.opt_level = cv::util::make_optional(opt_level); // kept as an optional so "not configured" stays distinguishable from an explicit level
+        return *this;
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend() const { return cv::gapi::onnx::backend(); }
     std::string   tag()     const { return Net::tag(); }
@@ -675,7 +688,7 @@ class Params<cv::gapi::Generic> {
     @param model_path path to model file (.onnx file).
     */
     Params(const std::string& tag, const std::string& model_path)
-        : desc{model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {}, {}, {}, false}, m_tag(tag) {}
+        : desc{ model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {}, {}, {}, false, {} }, m_tag(tag) {} // trailing {} initializes the new last member opt_level (no level set)
 
     /** @see onnx::Params::cfgMeanStdDev. */
     void cfgMeanStdDev(const std::string &layer,
@@ -724,6 +737,11 @@ class Params<cv::gapi::Generic> {
         desc.session_options.insert(options.begin(), options.end());
     }
 
+    /** @see onnx::Params::cfgOptLevel. */
+    void cfgOptLevel(const int opt_level) {
+        desc.opt_level = cv::util::make_optional(opt_level);
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend() const { return cv::gapi::onnx::backend(); }
     std::string   tag()     const { return m_tag; }
diff --git a/modules/gapi/src/backends/onnx/bindings_onnx.cpp b/modules/gapi/src/backends/onnx/bindings_onnx.cpp
index 294ad8a3cc21..5a2e3d2f6ded 100644
--- a/modules/gapi/src/backends/onnx/bindings_onnx.cpp
+++ b/modules/gapi/src/backends/onnx/bindings_onnx.cpp
@@ -63,6 +63,12 @@ cv::gapi::onnx::PyParams::cfgSessionOptions(const std::map<std::string, std::str
     return *this;
 }
 
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgOptLevel(const int opt_level) {
+    m_priv->cfgOptLevel(opt_level); // forward to the implementation object behind m_priv
+    return *this;                   // returned so calls can be chained
+}
+
 cv::gapi::GBackend cv::gapi::onnx::PyParams::backend() const {
     return m_priv->backend();
 }
diff --git a/modules/gapi/src/backends/onnx/gonnxbackend.cpp b/modules/gapi/src/backends/onnx/gonnxbackend.cpp
index 0d9a16a7bd7d..fc9b12b081f4 100644
--- a/modules/gapi/src/backends/onnx/gonnxbackend.cpp
+++ b/modules/gapi/src/backends/onnx/gonnxbackend.cpp
@@ -701,6 +701,26 @@ namespace cv {
 namespace gimpl {
 namespace onnx {
 
+static GraphOptimizationLevel convertToGraphOptimizationLevel(const int opt_level) {
+    // Values that coincide with an ONNX Runtime enumerator are passed through unchanged.
+    const bool matches_enumerator = opt_level == ORT_DISABLE_ALL
+                                 || opt_level == ORT_ENABLE_BASIC
+                                 || opt_level == ORT_ENABLE_EXTENDED
+                                 || opt_level == ORT_ENABLE_ALL;
+    if (matches_enumerator) {
+        return static_cast<GraphOptimizationLevel>(opt_level);
+    }
+
+    // Relaxed constraint: anything beyond ORT_ENABLE_ALL is treated as ORT_ENABLE_ALL.
+    if (opt_level > ORT_ENABLE_ALL) {
+        return ORT_ENABLE_ALL;
+    }
+
+    // Every remaining value has no corresponding optimization level.
+    const std::string reason = "Invalid argument opt_level = " + std::to_string(opt_level);
+    cv::util::throw_error(std::invalid_argument(reason));
+}
+
 ONNXCompiled::ONNXCompiled(const gapi::onnx::detail::ParamDesc &pp)
     : params(pp) {
     // Validate input parameters before allocating any resources
@@ -726,6 +746,10 @@ ONNXCompiled::ONNXCompiled(const gapi::onnx::detail::ParamDesc &pp)
     if (pp.disable_mem_pattern) {
         session_options.DisableMemPattern();
     }
+
+    if (pp.opt_level.has_value()) {
+        session_options.SetGraphOptimizationLevel(convertToGraphOptimizationLevel(pp.opt_level.value()));
+    }
     this_env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "");
 #ifndef _WIN32
     this_session = Ort::Session(this_env, params.model_path.data(), session_options);
diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index 9063aafe2c0f..6e317f585b7c 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -405,7 +405,7 @@ The function imencode compresses the image and stores it in the memory buffer th
 result. See cv::imwrite for the list of supported formats and flags description.
 
 @param ext File extension that defines the output format. Must include a leading period.
-@param img Image to be written.
+@param img Image to be compressed.
 @param buf Output buffer resized to fit the compressed image.
 @param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags.
 */
@@ -413,6 +413,20 @@ CV_EXPORTS_W bool imencode( const String& ext, InputArray img,
                             CV_OUT std::vector<uchar>& buf,
                             const std::vector<int>& params = std::vector<int>());
 
+/** @brief Encodes array of images into a memory buffer.
+
+The function is analogous to cv::imencode for in-memory multi-page image compression.
+See cv::imwrite for the list of supported formats and flags description.
+
+@param ext File extension that defines the output format. Must include a leading period.
+@param imgs Vector of images to be written.
+@param buf Output buffer resized to fit the compressed data.
+@param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags.
+*/
+CV_EXPORTS_W bool imencodemulti( const String& ext, InputArrayOfArrays imgs,
+                                 CV_OUT std::vector<uchar>& buf,
+                                 const std::vector<int>& params = std::vector<int>());
+
 /** @brief Checks if the specified image file can be decoded by OpenCV.
 
 The function haveImageReader checks if OpenCV is capable of reading the specified file.
diff --git a/modules/imgcodecs/src/grfmt_avif.cpp b/modules/imgcodecs/src/grfmt_avif.cpp
index 98ddb7336268..4752c6ece41b 100644
--- a/modules/imgcodecs/src/grfmt_avif.cpp
+++ b/modules/imgcodecs/src/grfmt_avif.cpp
@@ -142,8 +142,7 @@ static constexpr size_t kAvifSignatureSize = 500;
 AvifDecoder::AvifDecoder() {
   m_buf_supported = true;
   channels_ = 0;
-  decoder_ = avifDecoderCreate();
-  decoder_->strictFlags = AVIF_STRICT_DISABLED;
+  decoder_ = nullptr;  // the avifDecoder is now created lazily, on the first readHeader() call
 }
 
 AvifDecoder::~AvifDecoder() {
@@ -181,6 +180,11 @@ bool AvifDecoder::checkSignature(const String &signature) const {
 ImageDecoder AvifDecoder::newDecoder() const { return makePtr<AvifDecoder>(); }
 
 bool AvifDecoder::readHeader() {
+  if (decoder_)
+    return true;
+
+  decoder_ = avifDecoderCreate();
+  decoder_->strictFlags = AVIF_STRICT_DISABLED;
   if (!m_buf.empty()) {
     CV_Assert(m_buf.type() == CV_8UC1);
     CV_Assert(m_buf.rows == 1);
diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp
index e2184663aafd..ccc6579a012d 100644
--- a/modules/imgcodecs/src/grfmt_tiff.cpp
+++ b/modules/imgcodecs/src/grfmt_tiff.cpp
@@ -171,7 +171,7 @@ class TiffDecoderBufHelper
         {
             n = size - pos;
         }
-        memcpy(buffer, buf.ptr() + pos, n);
+        std::memcpy(buffer, buf.ptr() + pos, n);
         helper->m_buf_pos += n;
         return n;
     }
@@ -848,9 +848,9 @@ bool  TiffDecoder::readData( Mat& img )
                                     switch ( convert_flag )
                                     {
                                     case MAKE_FLAG( 1, 1 ): // GRAY to GRAY
-                                        memcpy( (void*) img_line_buffer,
-                                                (void*) bstart,
-                                                tile_width * sizeof(uchar) );
+                                        std::memcpy( (void*) img_line_buffer,
+                                                     (void*) bstart,
+                                                     tile_width * sizeof(uchar) );
                                         break;
 
                                     case MAKE_FLAG( 1, 3 ): // GRAY to BGR
@@ -867,9 +867,9 @@ bool  TiffDecoder::readData( Mat& img )
 
                                     case MAKE_FLAG( 3, 3 ): // RGB to BGR
                                         if (m_use_rgb)
-                                            memcpy( (void*) img_line_buffer,
-                                                    (void*) bstart,
-                                                    tile_width * sizeof(uchar) );
+                                            std::memcpy( (void*) img_line_buffer,
+                                                         (void*) bstart,
+                                                         tile_width * sizeof(uchar) );
                                         else
                                             icvCvt_BGR2RGB_8u_C3R( bstart, 0,
                                                     img_line_buffer, 0,
@@ -979,7 +979,7 @@ bool  TiffDecoder::readData( Mat& img )
                                     {
                                         CV_CheckEQ(wanted_channels, 3, "");
                                         if (m_use_rgb)
-                                            memcpy(buffer16, img.ptr<ushort>(img_y + i, x), tile_width * sizeof(ushort));
+                                            std::memcpy(buffer16, img.ptr<ushort>(img_y + i, x), tile_width * sizeof(ushort));
                                         else
                                             icvCvt_RGB2BGR_16u_C3R(buffer16, 0,
                                                     img.ptr<ushort>(img_y + i, x), 0,
@@ -1011,9 +1011,9 @@ bool  TiffDecoder::readData( Mat& img )
                                     CV_CheckEQ(wanted_channels, 1, "");
                                     if( ncn == 1 )
                                     {
-                                        memcpy(img.ptr<ushort>(img_y + i, x),
-                                               buffer16,
-                                               tile_width*sizeof(ushort));
+                                        std::memcpy(img.ptr<ushort>(img_y + i, x),
+                                                    buffer16,
+                                                    tile_width*sizeof(ushort));
                                     }
                                     else
                                     {
@@ -1118,10 +1118,16 @@ class TiffEncoderBufHelper
                                /*map=*/0, /*unmap=*/0 );
     }
 
-    static tmsize_t read( thandle_t /*handle*/, void* /*buffer*/, tmsize_t /*n*/ )
+    static tmsize_t read( thandle_t handle, void* buffer, tmsize_t n )
     {
-        // Not used for encoding.
-        return 0;
+        // Used by imencodemulti() when storing multiple images: copies n bytes from the current buffer position.
+        TiffEncoderBufHelper *helper = reinterpret_cast<TiffEncoderBufHelper*>(handle);
+        size_t begin = (size_t)helper->m_buf_pos;
+        size_t end = begin + n;
+        CV_CheckGE( helper->m_buf->size(), end , "do not be over-run buffer"); // a read may end exactly at the last byte
+        std::memcpy(buffer, helper->m_buf->data() + begin, n); // data()+begin stays valid even when begin == size() and n == 0
+        helper->m_buf_pos = end; // advance the shared read/write cursor past the bytes just returned
+        return n;
     }
 
     static tmsize_t write( thandle_t handle, void* buffer, tmsize_t n )
@@ -1133,7 +1139,7 @@ class TiffEncoderBufHelper
         {
             helper->m_buf->resize(end);
         }
-        memcpy(&(*helper->m_buf)[begin], buffer, n);
+        std::memcpy(&(*helper->m_buf)[begin], buffer, n);
         helper->m_buf_pos = end;
         return n;
     }
@@ -1350,7 +1356,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
             {
                 case 1:
                 {
-                    memcpy(buffer, img.ptr(y), scanlineSize);
+                    std::memcpy(buffer, img.ptr(y), scanlineSize);
                     break;
                 }
 
diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp
index 745b8633cc24..4f8d894aeeaa 100644
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@@ -723,6 +723,7 @@ static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,
         Mat temp;
         if( !encoder->isFormatSupported(image.depth()) )
         {
+            CV_LOG_ONCE_WARNING(NULL, "Unsupported depth image for selected encoder is fallbacked to CV_8U.");
             CV_Assert( encoder->isFormatSupported(CV_8U) );
             image.convertTo( temp, CV_8U );
             image = temp;
@@ -769,10 +770,12 @@ static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,
     catch (const cv::Exception& e)
     {
         CV_LOG_ERROR(NULL, "imwrite_('" << filename << "'): can't write data: " << e.what());
+        code = false;
     }
     catch (...)
     {
         CV_LOG_ERROR(NULL, "imwrite_('" << filename << "'): can't write data: unknown exception");
+        code = false;
     }
 
     return code;
@@ -960,7 +963,7 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
 
     ImageDecoder decoder = findDecoder(buf_row);
     if (!decoder)
-        return 0;
+        return false;
 
     // Try to decode image by RGB instead of BGR.
     if (flags & IMREAD_COLOR_RGB && flags != IMREAD_UNCHANGED)
@@ -977,7 +980,7 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
         filename = tempfile();
         FILE* f = fopen(filename.c_str(), "wb");
         if (!f)
-            return 0;
+            return false;
         size_t bufSize = buf_row.total() * buf.elemSize();
         if (fwrite(buf_row.ptr(), 1, bufSize, f) != bufSize)
         {
@@ -1103,49 +1106,80 @@ bool imdecodemulti(InputArray _buf, int flags, CV_OUT std::vector<Mat>& mats, co
     }
 }
 
-bool imencode( const String& ext, InputArray _image,
+bool imencode( const String& ext, InputArray _img,
                std::vector<uchar>& buf, const std::vector<int>& params )
 {
     CV_TRACE_FUNCTION();
 
-    Mat image = _image.getMat();
-    CV_Assert(!image.empty());
-
-    int channels = image.channels();
-    CV_Assert( channels == 1 || channels == 3 || channels == 4 );
-
     ImageEncoder encoder = findEncoder( ext );
     if( !encoder )
         CV_Error( Error::StsError, "could not find encoder for the specified extension" );
 
-    if( !encoder->isFormatSupported(image.depth()) )
+    std::vector<Mat> img_vec;
+    CV_Assert(!_img.empty());
+    if (_img.isMatVector() || _img.isUMatVector())
+        _img.getMatVector(img_vec);
+    else
+        img_vec.push_back(_img.getMat());
+
+    CV_Assert(!img_vec.empty());
+    const bool isMultiImg = img_vec.size() > 1;
+
+    std::vector<Mat> write_vec;
+    for (size_t page = 0; page < img_vec.size(); page++)
     {
-        CV_Assert( encoder->isFormatSupported(CV_8U) );
+        Mat image = img_vec[page];
+        CV_Assert(!image.empty());
+
+        const int channels = image.channels();
+        CV_Assert( channels == 1 || channels == 3 || channels == 4 );
+
         Mat temp;
-        image.convertTo(temp, CV_8U);
-        image = temp;
+        if( !encoder->isFormatSupported(image.depth()) )
+        {
+            CV_LOG_ONCE_WARNING(NULL, "Unsupported depth image for selected encoder is fallbacked to CV_8U.");
+            CV_Assert( encoder->isFormatSupported(CV_8U) );
+            image.convertTo( temp, CV_8U );
+            image = temp;
+        }
+
+        write_vec.push_back(image);
     }
 
     CV_Check(params.size(), (params.size() & 1) == 0, "Encoding 'params' must be key-value pairs");
     CV_CheckLE(params.size(), (size_t)(CV_IO_MAX_IMAGE_PARAMS*2), "");
 
-    bool code;
-    if( encoder->setDestination(buf) )
-    {
-        code = encoder->write(image, params);
-        encoder->throwOnEror();
-        CV_Assert( code );
-    }
-    else
+    bool code = false;
+    String filename;
+    if( !encoder->setDestination(buf) )
     {
-        String filename = tempfile();
+        filename = tempfile();
         code = encoder->setDestination(filename);
         CV_Assert( code );
+    }
+
+    try {
+        if (!isMultiImg)
+            code = encoder->write(write_vec[0], params);
+        else
+            code = encoder->writemulti(write_vec, params);
 
-        code = encoder->write(image, params);
         encoder->throwOnEror();
         CV_Assert( code );
+    }
+    catch (const cv::Exception& e)
+    {
+        CV_LOG_ERROR(NULL, "imencode(): can't encode data: " << e.what());
+        code = false;
+    }
+    catch (...)
+    {
+        CV_LOG_ERROR(NULL, "imencode(): can't encode data: unknown exception");
+        code = false;
+    }
 
+    if( !filename.empty() && code )
+    {
         FILE* f = fopen( filename.c_str(), "rb" );
         CV_Assert(f != 0);
         fseek( f, 0, SEEK_END );
@@ -1159,6 +1193,12 @@ bool imencode( const String& ext, InputArray _image,
     return code;
 }
 
+bool imencodemulti( const String& ext, InputArrayOfArrays imgs,
+                    std::vector<uchar>& buf, const std::vector<int>& params)
+{
+    return imencode(ext, imgs, buf, params); // imencode() unpacks Mat/UMat vectors itself and calls writemulti() for more than one image
+}
+
 bool haveImageReader( const String& filename )
 {
     ImageDecoder decoder = cv::findDecoder(filename);
diff --git a/modules/imgcodecs/test/test_avif.cpp b/modules/imgcodecs/test/test_avif.cpp
index 0d8a718756e6..d94e5d458c96 100644
--- a/modules/imgcodecs/test/test_avif.cpp
+++ b/modules/imgcodecs/test/test_avif.cpp
@@ -161,14 +161,14 @@ TEST_P(Imgcodecs_Avif_Image_EncodeDecodeSuite, imencode_imdecode) {
 
   // Encode.
   std::vector<unsigned char> buf;
-  if (!IsBitDepthValid()) {
-    EXPECT_THROW(cv::imencode(".avif", img_original, buf, encoding_params_),
-                 cv::Exception);
-    return;
-  }
   bool result = true;
   EXPECT_NO_THROW(
       result = cv::imencode(".avif", img_original, buf, encoding_params_););
+
+  if (!IsBitDepthValid()) {
+    EXPECT_FALSE(result);
+    return;
+  }
   EXPECT_TRUE(result);
 
   // Read back.
@@ -337,11 +337,20 @@ TEST_P(Imgcodecs_Avif_Animation_WriteDecodeSuite, encode_decode) {
   std::vector<unsigned char> buf(size);
   EXPECT_TRUE(file.read(reinterpret_cast<char*>(buf.data()), size));
   file.close();
-  EXPECT_EQ(0, remove(output.c_str()));
   std::vector<cv::Mat> anim;
   ASSERT_TRUE(cv::imdecodemulti(buf, imread_mode_, anim));
 
   ValidateRead(anim_original, anim);
+
+  if (imread_mode_ == IMREAD_UNCHANGED) {
+    ImageCollection collection(output, IMREAD_UNCHANGED);
+    anim.clear();
+    for (auto&& i : collection)
+      anim.push_back(i);
+    ValidateRead(anim_original, anim);
+  }
+
+  EXPECT_EQ(0, remove(output.c_str()));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/modules/imgcodecs/test/test_exr.impl.hpp b/modules/imgcodecs/test/test_exr.impl.hpp
index c8cda11a6374..32984ff731c7 100644
--- a/modules/imgcodecs/test/test_exr.impl.hpp
+++ b/modules/imgcodecs/test/test_exr.impl.hpp
@@ -314,4 +314,27 @@ TEST(Imgcodecs_EXR, read_RGBA_unchanged)
     EXPECT_EQ(0, remove(filenameOutput.c_str()));
 }
 
+// See https://github.com/opencv/opencv/pull/26211
+// ( related with https://github.com/opencv/opencv/issues/26207 )
+TEST(Imgcodecs_EXR, imencode_regression_26207_extra)
+{
+    // CV_8U is not a supported depth for the EXR encoder, so encoding is expected to throw.
+    const cv::Mat src(100, 100, CV_8UC1, cv::Scalar::all(0));
+    std::vector<uchar> buf;
+    bool ret = false;
+    EXPECT_ANY_THROW(ret = imencode(".exr", src, buf));
+    EXPECT_FALSE(ret); // the assignment never completes when imencode() throws, so ret keeps its initial value
+}
+TEST(Imgcodecs_EXR, imwrite_regression_26207_extra)
+{
+    // CV_8U is not a supported depth for the EXR encoder, so writing is expected to throw.
+    const cv::Mat src(100, 100, CV_8UC1, cv::Scalar::all(0));
+    const string filename = cv::tempfile(".exr");
+    bool ret = false;
+    EXPECT_ANY_THROW(ret = imwrite(filename, src));
+    EXPECT_FALSE(ret); // the assignment never completes when imwrite() throws, so ret keeps its initial value
+    remove(filename.c_str()); // cleanup only; the result is deliberately not checked
+}
+
+
 }} // namespace
diff --git a/modules/imgcodecs/test/test_read_write.cpp b/modules/imgcodecs/test/test_read_write.cpp
index 255f819a9a94..7dfd02c67ca9 100644
--- a/modules/imgcodecs/test/test_read_write.cpp
+++ b/modules/imgcodecs/test/test_read_write.cpp
@@ -520,8 +520,78 @@ TEST(ImgCodecs, multipage_collection_two_iterator_operatorpp)
          EXPECT_TRUE(cv::norm(img1, img[i], NORM_INF) == 0);
     }
 }
+
+// See https://github.com/opencv/opencv/issues/26207
+TEST(Imgcodecs, imencodemulti_regression_26207)
+{
+    vector<Mat> imgs;
+    const cv::Mat img(100, 100, CV_8UC1, cv::Scalar::all(0));
+    imgs.push_back(img);
+    std::vector<uchar> buf;
+    bool ret = false;
+
+    // Encode a single image: as a Mat, as a one-element vector, and through imencodemulti().
+    EXPECT_NO_THROW(ret = imencode(".tiff", img, buf));
+    EXPECT_TRUE(ret);
+    EXPECT_NO_THROW(ret = imencode(".tiff", imgs, buf));
+    EXPECT_TRUE(ret);
+    EXPECT_NO_THROW(ret = imencodemulti(".tiff", imgs, buf));
+    EXPECT_TRUE(ret);
+
+    // Encode multiple images (two pages) through both entry points.
+    imgs.push_back(img.clone());
+    EXPECT_NO_THROW(ret = imencode(".tiff", imgs, buf));
+    EXPECT_TRUE(ret);
+    EXPECT_NO_THROW(ret = imencodemulti(".tiff", imgs, buf));
+    EXPECT_TRUE(ret);
+
+    // Count the images stored in the buffer.
+    // imcount() doesn't accept a buffer, so the encoded buffer is written to a temporary file first.
+    const size_t len = buf.size();
+    const string filename = cv::tempfile(".tiff");
+    FILE *f = fopen(filename.c_str(), "wb");
+    ASSERT_NE(f, nullptr); // fatal on failure: fwrite()/fclose() below must never receive a null FILE*
+    EXPECT_EQ(len, fwrite(buf.data(), 1, len, f)); // data() is safe even if the buffer is empty
+    fclose(f);
+
+    EXPECT_EQ(2, (int)imcount(filename));
+    EXPECT_EQ(0, remove(filename.c_str()));
+}
 #endif
 
+// See https://github.com/opencv/opencv/pull/26211
+// ( related with https://github.com/opencv/opencv/issues/26207 )
+TEST(Imgcodecs, imencode_regression_26207_extra)
+{
+    // CV_32F is not a supported depth for the BMP encoder.
+    // The image is converted to CV_8U as a fallback, so the encoded buffer holds a CV_8U image.
+    const cv::Mat src(100, 100, CV_32FC1, cv::Scalar::all(0));
+    std::vector<uchar> buf;
+    bool ret = false;
+    EXPECT_NO_THROW(ret = imencode(".bmp", src, buf));
+    EXPECT_TRUE(ret);
+
+    cv::Mat dst;
+    EXPECT_NO_THROW(dst = imdecode(buf, IMREAD_GRAYSCALE));
+    EXPECT_FALSE(dst.empty());
+    EXPECT_EQ(CV_8UC1, dst.type()); // decoded data must be 8-bit single channel, confirming the fallback
+}
+TEST(Imgcodecs, imwrite_regression_26207_extra)
+{
+    // CV_32F is not a supported depth for the BMP encoder.
+    // The image is converted to CV_8U as a fallback, so the written file holds a CV_8U image.
+    const cv::Mat src(100, 100, CV_32FC1, cv::Scalar::all(0));
+    const string filename = cv::tempfile(".bmp");
+    bool ret = false;
+    EXPECT_NO_THROW(ret = imwrite(filename, src));
+    EXPECT_TRUE(ret);
+
+    cv::Mat dst;
+    EXPECT_NO_THROW(dst = imread(filename, IMREAD_GRAYSCALE));
+    EXPECT_FALSE(dst.empty());
+    EXPECT_EQ(CV_8UC1, dst.type()); // file content must be 8-bit single channel, confirming the fallback
+    EXPECT_EQ(0, remove(filename.c_str()));
+}
 
 TEST(Imgcodecs_Params, imwrite_regression_22752)
 {
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 16af2b969cad..7a80e1c2c452 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -48,7 +48,7 @@
 /**
 @defgroup imgproc Image Processing
 
-This module includes image-processing functions.
+This module offers a comprehensive suite of image processing functions, enabling tasks such as those listed above.
 
 @{
     @defgroup imgproc_filter Image Filtering
diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index 8d20a57a8767..ea10da3bb682 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -64,7 +64,7 @@ CollectPolyEdges( Mat& img, const Point2l* v, int npts,
                   int shift, Point offset=Point() );
 
 static void
-FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color, int line_type);
+FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color );
 
 static void
 PolyLine( Mat& img, const Point2l* v, int npts, bool closed,
@@ -1051,7 +1051,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes,
         v.push_back(center);
         std::vector<PolyEdge> edges;
         CollectPolyEdges( img,  &v[0], (int)v.size(), edges, color, line_type, XY_SHIFT );
-        FillEdgeCollection( img, edges, color, line_type );
+        FillEdgeCollection( img, edges, color );
     }
 }
 
@@ -1299,15 +1299,11 @@ CollectPolyEdges( Mat& img, const Point2l* v, int count, std::vector<PolyEdge>&
                 if (t0.y != t1.y)
                 {
                     pt0c.y = t0.y; pt1c.y = t1.y;
-                    pt0c.x = (int64)(t0.x) << XY_SHIFT;
-                    pt1c.x = (int64)(t1.x) << XY_SHIFT;
                 }
             }
-            else
-            {
-                pt0c.x += XY_ONE >> 1;
-                pt1c.x += XY_ONE >> 1;
-            }
+
+            pt0c.x = (int64)(t0.x) << XY_SHIFT;
+            pt1c.x = (int64)(t1.x) << XY_SHIFT;
         }
         else
         {
@@ -1349,7 +1345,7 @@ struct CmpEdges
 /**************** helper macros and functions for sequence/contour processing ***********/
 
 static void
-FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color, int line_type)
+FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color )
 {
     PolyEdge tmp;
     int i, y, total = (int)edges.size();
@@ -1358,12 +1354,7 @@ FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color, i
     int y_max = INT_MIN, y_min = INT_MAX;
     int64 x_max = 0xFFFFFFFFFFFFFFFF, x_min = 0x7FFFFFFFFFFFFFFF;
     int pix_size = (int)img.elemSize();
-    int delta;
-
-    if (line_type < LINE_AA)
-        delta = 0;
-    else
-        delta = XY_ONE - 1;
+    int delta = XY_ONE - 1;
 
     if( total < 2 )
         return;
@@ -2051,7 +2042,7 @@ void fillPoly( InputOutputArray _img, const Point** pts, const int* npts, int nc
         }
     }
 
-    FillEdgeCollection(img, edges, buf, line_type);
+    FillEdgeCollection(img, edges, buf);
 }
 
 void polylines( InputOutputArray _img, const Point* const* pts, const int* npts, int ncontours, bool isClosed,
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c1dd68d74988..8da46d26da46 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2508,7 +2508,7 @@ class ipp_warpAffineParallel: public ParallelLoopBody
 
 #endif
 
-static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation, int borderType, InputArray _M, int flags )
+static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation, int borderType, const Scalar & borderValue, InputArray _M, int flags )
 {
 #ifdef HAVE_IPP_IW
     CV_INSTRUMENT_REGION_IPP();
@@ -2527,7 +2527,7 @@ static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation
         Mat dst = _dst.getMat();
         ::ipp::IwiImage        iwSrc = ippiGetImage(src);
         ::ipp::IwiImage        iwDst = ippiGetImage(dst);
-        ::ipp::IwiBorderType   ippBorder(ippiGetBorderType(borderType));
+        ::ipp::IwiBorderType   ippBorder(ippiGetBorderType(borderType), ippiGetValue(borderValue));
         IwTransDirection       iwTransDirection;
         if(!ippBorder)
             return false;
@@ -2570,7 +2570,7 @@ static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation
     return true;
 #else
     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(interpolation);
-    CV_UNUSED(borderType); CV_UNUSED(_M); CV_UNUSED(flags);
+    CV_UNUSED(borderType); CV_UNUSED(borderValue); CV_UNUSED(_M); CV_UNUSED(flags);
     return false;
 #endif
 }
@@ -2795,7 +2795,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
     M0.convertTo(matM, matM.type());
 
-    CV_IPP_RUN_FAST(ipp_warpAffine(src, dst, interpolation, borderType, matM, flags));
+    CV_IPP_RUN_FAST(ipp_warpAffine(src, dst, interpolation, borderType, borderValue, matM, flags));
 
     if( !(flags & WARP_INVERSE_MAP) )
     {
diff --git a/modules/imgproc/test/test_drawing.cpp b/modules/imgproc/test/test_drawing.cpp
index 12784e0bdbcd..b0b1b47080e1 100755
--- a/modules/imgproc/test/test_drawing.cpp
+++ b/modules/imgproc/test/test_drawing.cpp
@@ -857,6 +857,74 @@ TEST(Drawing, ttf_text)
 }
 #endif
 
+TEST(Drawing, fillpoly_contours)
+{
+    const int imgSize = 50;
+    const int type = CV_8UC1;
+    const int shift = 0;
+    const Scalar cl = Scalar::all(255);
+    const cv::LineTypes lineType = LINE_8;
+
+    // check that contours of fillPoly and polylines match
+    {
+        cv::Mat img(imgSize, imgSize, type);
+        img = 0;
+        std::vector<std::vector<cv::Point>> polygonPoints{
+            { {44, 27}, {7, 37}, {7, 19}, {38, 19} }
+        };
+        cv::fillPoly(img, polygonPoints, cl, lineType, shift);
+        cv::polylines(img, polygonPoints, true, 0, 1, lineType, shift); // redraw the closed outline in colour 0 on top of the fill
+
+        {
+            cv::Mat labelImage(img.size(), CV_32S);
+            int labels = cv::connectedComponents(img, labelImage, 4); // third argument 4: presumably 4-connectivity - confirm against the API
+            EXPECT_EQ(2, labels) << "filling went over the border";
+        }
+    }
+
+    // check that line generated with fillPoly and polylines match
+    {
+        cv::Mat img1(imgSize, imgSize, type), img2(imgSize, imgSize, type);
+        img1 = 0;
+        img2 = 0;
+        std::vector<std::vector<cv::Point>> polygonPoints{
+            { {44, 27}, {38, 19} }
+        };
+        cv::fillPoly(img1, polygonPoints, cl, lineType, shift); // two-point polygon passed to fillPoly
+        cv::polylines(img2, polygonPoints, true, cl, 1, lineType, shift); // same two points drawn as a closed polyline
+        EXPECT_MAT_N_DIFF(img1, img2, 0);
+    }
+}
+
+TEST(Drawing, fillpoly_match_lines)
+{
+    const int imgSize = 49;
+    const int type = CV_8UC1;
+    const int shift = 0;
+    const Scalar cl = Scalar::all(255);
+    const cv::LineTypes lineType = LINE_8;
+    cv::Mat img1(imgSize, imgSize, type), img2(imgSize, imgSize, type);
+    for (int x1 = 0; x1 < imgSize; x1 += imgSize / 2) // first point x: 0, 24, 48
+    {
+        for (int y1 = 0; y1 < imgSize; y1 += imgSize / 2) // first point y: 0, 24, 48
+        {
+            for (int x2 = 0; x2 < imgSize; x2++) // second point sweeps every pixel of the image
+            {
+                for (int y2 = 0; y2 < imgSize; y2++)
+                {
+                    img1 = 0;
+                    img2 = 0;
+                    std::vector<std::vector<cv::Point>> polygonPoints{
+                        { {x1, y1}, {x2, y2} }
+                    };
+                    cv::fillPoly(img1, polygonPoints, cl, lineType, shift); // two-point polygon passed to fillPoly
+                    cv::polylines(img2, polygonPoints, true, cl, 1, lineType, shift); // same two points drawn as a closed polyline
+                    EXPECT_MAT_N_DIFF(img1, img2, 0); // both images must be identical
+                }
+            }
+        }
+    }
+}
 
 TEST(Drawing, fillpoly_fully)
 {
diff --git a/modules/js/perf/package.json b/modules/js/perf/package.json
index 04607ddffe56..01a2a8a4ccb6 100644
--- a/modules/js/perf/package.json
+++ b/modules/js/perf/package.json
@@ -1,19 +1,19 @@
 {
     "name": "opencv_js_perf",
-    "description": "Perfermance tests for opencv js bindings",
+    "description": "Performance tests for opencv js bindings",
     "version": "1.0.0",
-    "dependencies" : {
-      "benchmark" : "latest"
+    "dependencies": {
+        "benchmark": "latest"
     },
     "repository": {
-      "type": "git",
-      "url": "https://github.com/opencv/opencv.git"
+        "type": "git",
+        "url": "https://github.com/opencv/opencv.git"
     },
     "keywords": [],
     "author": "",
     "license": "Apache 2.0 License",
     "bugs": {
-      "url": "https://github.com/opencv/opencv/issues"
+        "url": "https://github.com/opencv/opencv/issues"
     },
     "homepage": "https://github.com/opencv/opencv"
-  }
\ No newline at end of file
+}
diff --git a/modules/objdetect/include/opencv2/objdetect/face.hpp b/modules/objdetect/include/opencv2/objdetect/face.hpp
index cf09c79d50dd..566204f7f926 100644
--- a/modules/objdetect/include/opencv2/objdetect/face.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/face.hpp
@@ -155,6 +155,22 @@ class CV_EXPORTS_W FaceRecognizerSF
      *  @param target_id the id of target device
      */
     CV_WRAP static Ptr<FaceRecognizerSF> create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0);
+
+    /**
+     *  @brief Creates an instance of this class from a buffer containing the model weights and configuration.
+     *  @param framework Name of the framework (ONNX, etc.)
+     *  @param bufferModel A buffer containing the binary model weights.
+     *  @param bufferConfig A buffer containing the network configuration.
+     *  @param backend_id The id of the backend.
+     *  @param target_id The id of the target device.
+     *
+     *  @return A pointer to the created instance of FaceRecognizerSF.
+     */
+    CV_WRAP static Ptr<FaceRecognizerSF> create(const String& framework,
+                                                const std::vector<uchar>& bufferModel,
+                                                const std::vector<uchar>& bufferConfig,
+                                                int backend_id = 0,
+                                                int target_id = 0);
 };
 
 //! @}
diff --git a/modules/objdetect/src/face_recognize.cpp b/modules/objdetect/src/face_recognize.cpp
index 8183573ce982..a5f4641da306 100644
--- a/modules/objdetect/src/face_recognize.cpp
+++ b/modules/objdetect/src/face_recognize.cpp
@@ -26,6 +26,19 @@ class FaceRecognizerSFImpl : public FaceRecognizerSF
         net.setPreferableBackend(backend_id);
         net.setPreferableTarget(target_id);
     }
+
+    FaceRecognizerSFImpl(const String& framework, // buffer-based constructor: the network comes from memory, not from files
+                         const std::vector<uchar>& bufferModel,
+                         const std::vector<uchar>& bufferConfig,
+                         int backend_id, int target_id)
+    {
+        net = dnn::readNet(framework, bufferModel, bufferConfig);
+        CV_Assert(!net.empty()); // reject an empty network returned by readNet()
+
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+    }
+
     void alignCrop(InputArray _src_img, InputArray _face_mat, OutputArray _aligned_img) const override
     {
         Mat face_mat = _face_mat.getMat();
@@ -189,4 +202,17 @@ Ptr<FaceRecognizerSF> FaceRecognizerSF::create(const String& model, const String
 #endif
 }
 
+Ptr<FaceRecognizerSF> FaceRecognizerSF::create(const String& framework, // factory overload taking in-memory model/config buffers
+                                               const std::vector<uchar>& bufferModel,
+                                               const std::vector<uchar>& bufferConfig,
+                                               int backend_id, int target_id)
+{
+#ifdef HAVE_OPENCV_DNN
+    return makePtr<FaceRecognizerSFImpl>(framework, bufferModel, bufferConfig, backend_id, target_id);
+#else // built without dnn: mark every parameter as unused, then report the missing module
+    CV_UNUSED(framework); CV_UNUSED(bufferModel); CV_UNUSED(bufferConfig); CV_UNUSED(backend_id); CV_UNUSED(target_id);
+    CV_Error(cv::Error::StsNotImplemented, "cv::FaceRecognizerSF requires enabled 'dnn' module");
+#endif
+}
+
 } // namespace cv
diff --git a/modules/video/src/hal_replacement.hpp b/modules/video/src/hal_replacement.hpp
index 8d10ab39d1f3..396fa9a2d548 100644
--- a/modules/video/src/hal_replacement.hpp
+++ b/modules/video/src/hal_replacement.hpp
@@ -27,7 +27,9 @@
 //! @{
 
 /**
-@brief Lucas-Kanade optical flow for single pyramid layer. See calcOpticalFlowPyrLK
+@brief Lucas-Kanade optical flow for single pyramid layer. See calcOpticalFlowPyrLK.
+@note OpenCV builds pyramid levels with `win_size` padding. Out-of-bound access to source
+image data is legal within `+-win_size` range.
 @param prev_data previous frame image data
 @param prev_data_step previous frame image data step
 @param prev_deriv_data previous frame Schaar derivatives
@@ -67,6 +69,29 @@ inline int hal_ni_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_st
 #define cv_hal_LKOpticalFlowLevel hal_ni_LKOpticalFlowLevel
 //! @endcond
 
+/**
+@brief Computes Scharr derivatives with interleaved layout xyxy...
+@note OpenCV builds pyramid levels with `win_size` padding. Out-of-bound access to source
+image data is legal within `+-win_size` range.
+@param src_data source image data
+@param src_step source image step
+@param dst_data destination buffer data
+@param dst_step destination buffer step
+@param width image width
+@param height image height
+@param cn source image channels
+**/
+inline int hal_ni_ScharrDeriv(const uchar* src_data, size_t src_step,
+                              short* dst_data, size_t dst_step,
+                              int width, int height, int cn)
+{
+    return CV_HAL_ERROR_NOT_IMPLEMENTED; // default stub bound to cv_hal_ScharrDeriv below; it always reports "not implemented"
+}
+
+//! @cond IGNORED
+#define cv_hal_ScharrDeriv hal_ni_ScharrDeriv
+//! @endcond
+
 //! @}
 
 #if defined(__clang__)
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 03de93ee08a8..25fdc35c5b71 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -62,6 +62,9 @@ static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst)
     int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth();
     CV_Assert(depth == CV_8U);
     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
+
+    CALL_HAL(ScharrDeriv, cv_hal_ScharrDeriv, src.data, src.step, (short*)dst.data, dst.step, cols, rows, cn);
+
     parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads());
 }
 
diff --git a/modules/videoio/src/precomp.hpp b/modules/videoio/src/precomp.hpp
index 6a9546a6f332..8ebeec4a9eb0 100644
--- a/modules/videoio/src/precomp.hpp
+++ b/modules/videoio/src/precomp.hpp
@@ -53,7 +53,7 @@
 #include <unistd.h>  // -D_FORTIFY_SOURCE=2 workaround: https://github.com/opencv/opencv/issues/15020
 #endif
 
-
+#include "opencv2/core/cvdef.h"
 #include "opencv2/videoio.hpp"
 
 #include "opencv2/core/utility.hpp"
diff --git a/samples/cpp/filestorage.cpp b/samples/cpp/filestorage.cpp
deleted file mode 100644
index e0b462bba6b6..000000000000
--- a/samples/cpp/filestorage.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * filestorage_sample demonstrate the usage of the opencv serialization functionality
- */
-
-#include "opencv2/core.hpp"
-#include <iostream>
-#include <string>
-
-using std::string;
-using std::cout;
-using std::endl;
-using std::cerr;
-using std::ostream;
-using namespace cv;
-
-static void help(char** av)
-{
-  cout << "\nfilestorage_sample demonstrate the usage of the opencv serialization functionality.\n"
-      << "usage:\n"
-      <<  av[0] << " outputfile.yml.gz\n"
-      << "\n   outputfile above can have many different extensions, see below."
-      << "\nThis program demonstrates the use of FileStorage for serialization, that is in use << and >>  in OpenCV\n"
-      << "For example, how to create a class and have it serialize, but also how to use it to read and write matrices.\n"
-      << "FileStorage allows you to serialize to various formats specified by the file end type."
-          << "\nYou should try using different file extensions.(e.g. yaml yml xml xml.gz yaml.gz etc...)\n" << endl;
-}
-
-struct MyData
-{
-  MyData() :
-    A(0), X(0), id()
-  {
-  }
-  explicit MyData(int) :
-    A(97), X(CV_PI), id("mydata1234")
-  {
-  }
-  int A;
-  double X;
-  string id;
-  void write(FileStorage& fs) const //Write serialization for this class
-  {
-    fs << "{" << "A" << A << "X" << X << "id" << id << "}";
-  }
-  void read(const FileNode& node)  //Read serialization for this class
-  {
-
-    A = (int)node["A"];
-    X = (double)node["X"];
-    id = (string)node["id"];
-  }
-};
-
-//These write and read functions must exist as per the inline functions in operations.hpp
-static void write(FileStorage& fs, const std::string&, const MyData& x){
-  x.write(fs);
-}
-static void read(const FileNode& node, MyData& x, const MyData& default_value = MyData()){
-  if(node.empty())
-    x = default_value;
-  else
-    x.read(node);
-}
-
-static ostream& operator<<(ostream& out, const MyData& m){
-  out << "{ id = " << m.id << ", ";
-  out << "X = " << m.X << ", ";
-  out << "A = " << m.A << "}";
-  return out;
-}
-int main(int ac, char** av)
-{
-  cv::CommandLineParser parser(ac, av,
-    "{@input||}{help h ||}"
-  );
-  if (parser.has("help"))
-  {
-    help(av);
-    return 0;
-  }
-  string filename = parser.get<string>("@input");
-  if (filename.empty())
-  {
-    help(av);
-    return 1;
-  }
-
-  //write
-  {
-    FileStorage fs(filename, FileStorage::WRITE);
-
-    cout << "writing images\n";
-    fs << "images" << "[";
-
-    fs << "image1.jpg" << "myfi.png" << "baboon.jpg";
-    cout << "image1.jpg" << " myfi.png" << " baboon.jpg" << endl;
-
-    fs << "]";
-
-    cout << "writing mats\n";
-    Mat R =Mat_<double>::eye(3, 3),T = Mat_<double>::zeros(3, 1);
-    cout << "R = " << R << "\n";
-    cout << "T = " << T << "\n";
-    fs << "R" << R;
-    fs << "T" << T;
-
-    cout << "writing MyData struct\n";
-    MyData m(1);
-    fs << "mdata" << m;
-    cout << m << endl;
-  }
-
-  //read
-  {
-    FileStorage fs(filename, FileStorage::READ);
-
-    if (!fs.isOpened())
-    {
-      cerr << "failed to open " << filename << endl;
-      help(av);
-      return 1;
-    }
-
-    FileNode n = fs["images"];
-    if (n.type() != FileNode::SEQ)
-    {
-      cerr << "images is not a sequence! FAIL" << endl;
-      return 1;
-    }
-
-    cout << "reading images\n";
-    FileNodeIterator it = n.begin(), it_end = n.end();
-    for (; it != it_end; ++it)
-    {
-      cout << (string)*it << "\n";
-    }
-
-    Mat R, T;
-    cout << "reading R and T" << endl;
-
-    fs["R"] >> R;
-    fs["T"] >> T;
-
-    cout << "R = " << R << "\n";
-    cout << "T = " << T << endl;
-
-    MyData m;
-    fs["mdata"] >> m;
-
-    cout << "read mdata\n";
-    cout << m << endl;
-
-    cout << "attempting to read mdata_b\n";   //Show default behavior for empty matrix
-    fs["mdata_b"] >> m;
-    cout << "read mdata_b\n";
-    cout << m << endl;
-
-  }
-
-  cout << "Try opening " << filename << " to see the serialized data." << endl << endl;
-
-  //read from string
-  {
-    cout << "Read data from string\n";
-    string dataString =
-        "%YAML:1.0\n"
-        "mdata:\n"
-        "   A: 97\n"
-        "   X: 3.1415926535897931e+00\n"
-        "   id: mydata1234\n";
-    MyData m;
-    FileStorage fs(dataString, FileStorage::READ | FileStorage::MEMORY);
-    cout << "attempting to read mdata_b from string\n";   //Show default behavior for empty matrix
-    fs["mdata"] >> m;
-    cout << "read mdata\n";
-    cout << m << endl;
-  }
-
-  //write to string
-  {
-    cout << "Write data to string\n";
-    FileStorage fs(filename, FileStorage::WRITE | FileStorage::MEMORY | FileStorage::FORMAT_YAML);
-
-    cout << "writing MyData struct\n";
-    MyData m(1);
-    fs << "mdata" << m;
-    cout << m << endl;
-    string createdString = fs.releaseAndGetString();
-    cout << "Created string:\n" << createdString << "\n";
-  }
-
-  return 0;
-}
diff --git a/samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp b/samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp
index b17e24eaf475..8a04f6f84454 100644
--- a/samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp
+++ b/samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp
@@ -8,14 +8,14 @@ using namespace std;
 static void help(char** av)
 {
     cout << endl
-        << av[0] << " shows the usage of the OpenCV serialization functionality."         << endl
+        << av[0] << " shows the usage of the OpenCV serialization functionality."         << endl << endl
         << "usage: "                                                                      << endl
-        <<  av[0] << " outputfile.yml.gz"                                                 << endl
-        << "The output file may be either XML (xml) or YAML (yml/yaml). You can even compress it by "
-        << "specifying this in its extension like xml.gz yaml.gz etc... "                  << endl
+        <<  av[0] << " [output file name] (default outputfile.yml.gz)"                    << endl << endl
+        << "The output file may be XML (xml), YAML (yml/yaml), or JSON (json)." << endl
+        << "You can even compress it by specifying this in its extension like xml.gz yaml.gz etc... " << endl
         << "With FileStorage you can serialize objects in OpenCV by using the << and >> operators" << endl
         << "For example: - create a class and have it serialized"                         << endl
-        << "             - use it to read and write matrices."                            << endl;
+        << "             - use it to read and write matrices."                            << endl << endl;
 }
 
 class MyData
@@ -68,13 +68,16 @@ static ostream& operator<<(ostream& out, const MyData& m)
 
 int main(int ac, char** av)
 {
+    string filename;
+
     if (ac != 2)
     {
         help(av);
-        return 1;
+        filename = "outputfile.yml.gz";
     }
+    else
+        filename = av[1];
 
-    string filename = av[1];
     { //write
         //! [iomati]
         Mat R = Mat_<uchar>::eye(3, 3),
@@ -118,7 +121,7 @@ int main(int ac, char** av)
         //! [close]
         fs.release();                                       // explicit close
         //! [close]
-        cout << "Write Done." << endl;
+        cout << "Write operation to file: " << filename << " completed successfully." << endl;
     }
 
     {//read
diff --git a/samples/cpp/tutorial_code/snippets/core_various.cpp b/samples/cpp/tutorial_code/snippets/core_various.cpp
index 2be97f989da3..b3d590100dec 100644
--- a/samples/cpp/tutorial_code/snippets/core_various.cpp
+++ b/samples/cpp/tutorial_code/snippets/core_various.cpp
@@ -78,6 +78,7 @@ int main()
             tm.start();
             // do something ...
             tm.stop();
+            cout << "Last iteration: " << tm.getLastTimeSec() << endl;
         }
         cout << "Average time per iteration in seconds: " << tm.getAvgTimeSec() << endl;
         cout << "Average FPS: " << tm.getFPS() << endl;
diff --git a/samples/python/tutorial_code/core/file_input_output/file_input_output.py b/samples/python/tutorial_code/core/file_input_output/file_input_output.py
index 66b3108dbaf3..95eb1afcc94d 100644
--- a/samples/python/tutorial_code/core/file_input_output/file_input_output.py
+++ b/samples/python/tutorial_code/core/file_input_output/file_input_output.py
@@ -9,10 +9,10 @@ def help(filename):
         '''
         {0} shows the usage of the OpenCV serialization functionality. \n\n
         usage:\n
-            python3 {0} outputfile.yml.gz\n\n
-        The output file may be either in XML, YAML or JSON. You can even compress it\n
-        by specifying this in its extension like xml.gz yaml.gz etc... With\n
-        FileStorage you can serialize objects in OpenCV.\n\n
+            python3 {0} [output file name] (default outputfile.yml.gz)\n\n
+        The output file may be XML (xml), YAML (yml/yaml), or JSON (json).\n
+        You can even compress it by specifying this in its extension like xml.gz yaml.gz etc...\n
+        With FileStorage you can serialize objects in OpenCV.\n\n
         For example: - create a class and have it serialized\n
                      - use it to read and write matrices.\n
         '''.format(filename)
@@ -49,7 +49,9 @@ def read(self, node):
 def main(argv):
     if len(argv) != 2:
         help(argv[0])
-        exit(1)
+        filename = 'outputfile.yml.gz'
+    else:
+        filename = argv[1]
 
     # write
     ## [iomati]
@@ -60,8 +62,6 @@ def main(argv):
     m = MyData()
     ## [customIOi]
 
-    filename = argv[1]
-
     ## [open]
     s = cv.FileStorage(filename, cv.FileStorage_WRITE)
     # or:
@@ -98,7 +98,7 @@ def main(argv):
     ## [close]
     s.release()
     ## [close]
-    print ('Write Done.')
+    print ('Write operation to file:', filename, 'completed successfully.')
 
     # read
     print ('\nReading: ')